def parse(sentence):
    # remove leading/trailing whitespace
    sline = sentence.strip()
    # remove % signs at either end
    sline = sline.strip("%")
    rline = cleanhtml(sline)
    tokenized_line = ' '.join(p_tokenize(rline))
    # keep every token that is not a pure digit string
    is_alpha_word_line = [word for word in tokenized_line.lower().split()
                          if not word.isdigit()]
    return is_alpha_word_line
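# cleanhtml() and p_tokenize() are called throughout but never defined in
# these snippets. A minimal sketch of plausible implementations; the names
# come from the original code, but these bodies are assumptions (a regex
# tag stripper and NLTK's word tokenizer), not the original helpers:
import re

from nltk.tokenize import word_tokenize

TAG_RE = re.compile(r'<[^>]+>')

def cleanhtml(raw_html):
    # assumed behavior: drop anything that looks like an HTML tag
    return TAG_RE.sub('', raw_html)

def p_tokenize(line):
    # assumed to be a plain word tokenizer, here NLTK's word_tokenize
    return word_tokenize(line)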
class SentenceCorpus(object):
    # class wrapper and __init__ are assumed for context; only __iter__
    # appeared in the original snippet
    def __init__(self, dirname):
        self.dirname = dirname

    def __iter__(self):
        for root, dirs, files in os.walk(self.dirname):
            for filename in files:
                file_path = os.path.join(root, filename)
                with open(file_path) as f:
                    for line in f:
                        sline = line.strip()
                        if sline == "":
                            continue
                        rline = cleanhtml(sline)
                        tokenized_line = ' '.join(p_tokenize(rline))
                        # keep only purely alphabetic tokens
                        is_alpha_word_line = [word for word in tokenized_line.lower().split()
                                              if word.isalpha()]
                        yield is_alpha_word_line
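# Usage sketch (assumed, not from the original code): because the corpus is
# a restartable iterable, it can be streamed straight into gensim's Word2Vec
# without loading every file into memory. 'data/docs' is a placeholder path.
from gensim.models import Word2Vec

sentences = SentenceCorpus('data/docs')
model = Word2Vec(sentences, min_count=1)
# query assumes 'statistics' actually occurs in the corpus vocabulary
print(model.wv.most_similar('statistics', topn=5))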
def parse_sent(sentence):
    """Parse a sentence into a list of alphabetic words."""
    # remove leading/trailing whitespace
    sline = sentence.strip()
    # remove % sign
    # sline = sline.strip("%")
    # sline = sline.rstrip("'s")
    rline = cleanhtml(sline)
    # tokenize the line
    tokenized_line = ' '.join(p_tokenize(rline))
    # drop digits and other non-alphabetic tokens
    is_alpha_word_line = [word for word in tokenized_line.lower().split()
                          if word.isalpha()]
    return is_alpha_word_line
# assumed imports (not shown in the original): cur_dir is the working
# directory, word2vec/Phrases come from gensim, n_tokenize is nltk.tokenize
name = 'computer_age_statis.pdf'
file_name = os.path.join(cur_dir, 'data', 'docs', name)
txt_file = os.path.join(cur_dir, name)

sentences = word2vec.Text8Corpus('text8')  # unused: overwritten below

import nltk.data
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
# print '\n-----\n'.join(tokenizer.tokenize(text))

# `text` is assumed to hold the document's raw text; its loading is not shown
words = []
sentences = n_tokenize.sent_tokenize(text)
for line in sentences:
    sline = line.strip()
    if sline == "":
        continue
    rline = cleanhtml(sline)
    tokenized_line = ' '.join(p_tokenize(rline))
    is_alpha_word_line = [word for word in tokenized_line.lower().split()
                          if word.isalpha()]
    words.append(is_alpha_word_line)

common_terms = ["of", "with", "without", "and", "or", "the", "a", "an"]
phrases = Phrases(words, min_count=1, threshold=2, common_terms=common_terms)
# bigram = Phraser(phrases, common_terms=common_terms)
# sent = [u'the', u'mayor', u'of', u'new', u'york', u'was', u'there']
# pprint.pprint(bigram[sent])
print('\n')
# pprint.pprint(list(bigram[words]))
# model.most_similar(positive=['woman', 'king'], negative=['man'], topn=1)
# model.most_similar(positive=['woman', 'king'], negative=['man'], topn=2)
# model.most_similar(['titular'])
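# A sketch of where the commented-out lines above seem headed (an assumed
# workflow, using the gensim 3.x API to match the `common_terms` keyword):
# freeze the phrase model, rewrite the corpus with detected bigrams, train
# Word2Vec on the result, and run the analogy queries from the comments.
from gensim.models import Word2Vec
from gensim.models.phrases import Phraser

bigram = Phraser(phrases)  # frozen, faster version of the Phrases model
bigram_words = [bigram[sent] for sent in words]

model = Word2Vec(bigram_words, min_count=1)
# queries only succeed if these words appear in the corpus vocabulary
print(model.wv.most_similar(positive=['woman', 'king'], negative=['man'], topn=1))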