Example #1
import nltk
import nltk.data

def parsing(data, phrases):
    # Split data into sentences, then collect the POS-tag sequence of every
    # sentence containing one of the phrases (subjective) vs. the rest (objective).
    subjective = []
    objective = []
    sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
    val = sent_detector.tokenize(data.strip())

    for sent in val:
        sub_tags = []
        obj_tags = []
        tok_sent = nltk.word_tokenize(sent)
        pos = nltk.pos_tag(tok_sent)
        sent_status = False
        for p in phrases:
            if p in sent:    
                for tags in pos:
                    sub_tags.append(tags[1])
                subjective.append(sub_tags)    
                #print p, "--",  sent, "--", pos
                sent_status = True
                break
        if not sent_status:
            for tags in pos:
                obj_tags.append(tags[1])
            objective.append(obj_tags)
    return subjective, objective
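
A minimal usage sketch with made-up sample text and a made-up phrase list; the nltk.download calls fetch the models the function depends on:

import nltk

nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

text = "The plot was wonderful. The film runs 120 minutes."
subjective, objective = parsing(text, ["wonderful"])
print(subjective)  # POS-tag sequences of sentences that contain a phrase
print(objective)   # POS-tag sequences of the remaining sentences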
Example #2
File: W2v.py Project: saridsa1/cdc
def ToSentences(data, tokenizer, concept, stem, removeStopwords):
    data = tokenizer.tokenize(data.strip())
    sentences = []
    for s in data:
        if len(s) > 0:
            sentences.append(Tokenization(s, concept, stem, removeStopwords))
    return sentences
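
The Tokenization helper comes from the same project and is not shown in this excerpt; a plausible stand-in, assuming it word-tokenizes a sentence with optional stopword removal and stemming (the real helper, including its concept handling, may differ):

import nltk
from nltk.corpus import stopwords          # requires nltk.download('stopwords')
from nltk.stem.porter import PorterStemmer

def Tokenization(sentence, concept, stem, removeStopwords):
    # hypothetical stand-in for the project's helper
    words = nltk.word_tokenize(sentence.lower())
    if removeStopwords:
        stops = set(stopwords.words('english'))
        words = [w for w in words if w not in stops]
    if stem:
        stemmer = PorterStemmer()
        words = [stemmer.stem(w) for w in words]
    # 'concept' handling is project-specific and omitted here
    return words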
Example #3
import nltk
from collections import Counter
from nltk.util import ngrams

def checkGrams(data, phrases):
    sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
    val = sent_detector.tokenize(data.strip())
    grams = []
    for sent in val:
        tokens = nltk.tokenize.word_tokenize(sent)
        tok_cnt = Counter(ngrams(tokens, 2))
        if any(p in sent for p in phrases):  # assumed completion; the original excerpt is truncated here
            grams.append(tok_cnt)
    return grams
Example #4
def __text_to_sentences(self, data):
    raw_sentences = self.tokenizer.tokenize(data.strip())
    sentences = []
    for raw_sentence in raw_sentences:
        if len(raw_sentence) > 0:
            sentences.append(self.__sentence_to_wordlist(raw_sentence))
    return sentences
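
The method expects a class that keeps a Punkt tokenizer in self.tokenizer and defines a private __sentence_to_wordlist helper; a minimal sketch of that surrounding context (the class name and the helper body are assumptions for illustration):

import nltk
import nltk.data

class SentenceSplitter:
    # illustrative context only; the real class is not part of the excerpt
    def __init__(self):
        self.tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

    def __sentence_to_wordlist(self, raw_sentence):
        # the original helper probably cleans the text as well
        return nltk.word_tokenize(raw_sentence.lower())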
Example #5
def get_summary():

    '''Calculate degree centrality and from it the summary of the text.
       Output: valid_sentences, matrix, "sentence_links.txt"'''

    global sentenceList, n
    global matrix
    global sentence_similarities
    global valid_sentences
    global sentence_rank
    key_max = max(sentence_similarities.keys(), key=(lambda k: sentence_similarities[k]))
    key_min = min(sentence_similarities.keys(), key=(lambda k: sentence_similarities[k]))
    threshold = float(sentence_similarities[key_max] + sentence_similarities[key_min]) / float(2)

    # select the sentence pairs with weights greater than the threshold
    for i in sentence_similarities:
        if sentence_similarities[i] > threshold:
            nodes = i.split()
            valid_sentences.append(nodes)

    # fill the matrix with mappings according to similarities
    for i in range(n):
        for lis in valid_sentences:
            if str(i) in lis:
                if str(i) not in matrix:
                    matrix[str(i)] = []
                node = lis[(lis.index(str(i)) + 1) % 2]
                if node not in matrix[str(i)]:
                    matrix[str(i)].append(node)

    # write the matrix to the file 'sentence_links.txt'
    with open('sentence_links.txt', 'w') as f:
        for key in matrix:
            f.write(key)
            for i in matrix[key]:
                f.write(" " + i)
            f.write('\n')

    # rank the sentence graph with the external PageRank script
    data = os.popen('python 5_pageRank.py sentence_links.txt 2').read()
    a = data.strip().split('\n')
    d = a[2:]
    d.sort()
    summary = ''
    for i in d:
        summary += sentenceList[int(i)].capitalize()
    print(summary)
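
get_summary() operates entirely on module-level state; a hedged sketch of the shape that state presumably has, inferred from how the globals are used above:

sentenceList = ["first sentence ...", "second sentence ...", "third sentence ..."]
n = len(sentenceList)
# pairwise similarity weights, keyed by "i j" sentence-index pairs
sentence_similarities = {"0 1": 0.42, "0 2": 0.10, "1 2": 0.35}
matrix = {}            # adjacency lists written to sentence_links.txt
valid_sentences = []   # index pairs whose weight exceeds the threshold
sentence_rank = {}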
Example #6
import os
import nltk.data

def breakTextFile(path_to_file, path_to_test_group, min_len=150):

    # creating the directory
    if not os.path.exists(path_to_test_group):
        os.makedirs(path_to_test_group)

    with open(path_to_file, 'r') as myfile:
        data=myfile.read()

    sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
    current_sentence = ''
    test_count = 1
    for sentence in sent_detector.tokenize(data.strip()):
        current_sentence = current_sentence + ' ' + sentence
        if len(current_sentence) > min_len:
            # write to text file
            output_path = path_to_test_group + '/testcaseno' + str(test_count) + '.txt'
            with open(output_path, "w") as text_file:
                text_file.write(current_sentence[1:])
            test_count = test_count + 1
            current_sentence = ''
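
An example call, with placeholder paths:

# splits article.txt into ~150-character chunks, one file per chunk,
# written to test_group/testcaseno1.txt, testcaseno2.txt, ...
breakTextFile('article.txt', 'test_group', min_len=150)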
Example #7
def data_to_sentences(data, tokenizer, remove_stopwords=False):
    # Function to split a review into parsed sentences. Returns a
    # list of sentences, where each sentence is a list of words.
    #
    # 1. Use the NLTK tokenizer to split the paragraph into sentences
    raw_sentences = tokenizer.tokenize(data.strip())

    # 2. Loop over each sentence
    sentences = []
    for raw_sentence in raw_sentences:
        # If a sentence is empty, skip it
        if len(raw_sentence) > 0:
            # Otherwise, call data_to_wordlist to get a list of words
            sentences.append(data_to_wordlist(raw_sentence, remove_stopwords))

    # Return the list of sentences (each sentence is a list of words,
    # so this returns a list of lists)
    return sentences
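
data_to_wordlist is defined elsewhere in the original script; a hedged usage sketch with a simple stand-in for it:

import re
import nltk.data
from nltk.corpus import stopwords          # requires nltk.download('stopwords')

def data_to_wordlist(raw_sentence, remove_stopwords=False):
    # stand-in: keep letters only, lowercase, split on whitespace
    words = re.sub("[^a-zA-Z]", " ", raw_sentence).lower().split()
    if remove_stopwords:
        stops = set(stopwords.words('english'))
        words = [w for w in words if w not in stops]
    return words

tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
print(data_to_sentences("First sentence here. Second one!", tokenizer))
# [['first', 'sentence', 'here'], ['second', 'one']]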
Example #8
    cleaned = re.sub(r"(?s)<.*?>", " ", cleaned)
    # Finally, we deal with whitespace
    cleaned = re.sub(r"&nbsp;", " ", cleaned)
    cleaned = re.sub(r"  ", " ", cleaned)
    cleaned = re.sub(r"  ", " ", cleaned)
    return cleaned.strip()

if __name__ == '__main__':
    folder = sys.argv[1]
    output = sys.argv[2]
    sentTotalCnt = 0
    for root, dirnames, filenames in os.walk(folder):
        for filename in filenames:        
            with open(os.path.join(root, filename)) as fp:
                data = fp.read()
            data = unicode(data.strip(), errors='ignore')
            data = clean_html(data)
            count = 0
            for sentence in tokenizer.tokenize(data):
                sentence = sentence.strip()
                directory = str(sentTotalCnt // 10000).zfill(5)
                pathname = output + '/' + directory + '/' + filename + '.' + str(count)
                if not os.path.exists(os.path.dirname(pathname)):
                    try:
                        os.makedirs(os.path.dirname(pathname))
                    except OSError as exc:  # Guard against race condition
                        if exc.errno != errno.EEXIST:
                            raise
                # assumed continuation of the truncated excerpt: write the
                # sentence, close the file, and advance both counters
                with open(pathname, 'w') as writeF:
                    writeF.write(sentence)
                count += 1
                sentTotalCnt += 1
Example #9
import nltk.data
import re
import string
with open('C://Users//jbjb//Documents//DATA//weird corpus//extras//poundearly.txt',
          'r', encoding='utf-8') as infile:
    data = infile.read().replace('\n', ' ')
sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
print('\n-----\n'.join(sent_detector.tokenize(data.strip())))
Example #10
def _sentences(data):
    # Keep only printable ASCII characters, then sentence-split with Punkt
    data = ''.join(ch for ch in data if ch in printable)
    sentences = sent_detector.tokenize(data.strip())
    return sentences
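
The excerpt relies on module-level printable and sent_detector objects, presumably set up along these lines:

import nltk.data
from string import printable

sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')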
Example #11
import nltk.data
sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')

input_file_name = "input.txt"
output_file_name = "output.txt"

with open(input_file_name, 'r', encoding='utf-8') as input_file:
    data = input_file.read().replace('\n', '')
# print('\n---\n'.join(sent_detector.tokenize(data.strip())))

with open(output_file_name, 'w', encoding='utf-8') as output_file:
    for item in sent_detector.tokenize(data.strip()):
        output_file.write("%s\n" % item)