def parse(sentence):
    """Parse a sentence into a list of lowercase, non-numeric tokens."""
    # strip leading/trailing whitespace
    sline = sentence.strip()
    # strip leading/trailing % signs
    sline = sline.strip("%")
    # remove HTML markup
    rline = cleanhtml(sline)

    tokenized_line = ' '.join(p_tokenize(rline))
    # keep lowercase tokens that are not purely numeric
    is_alpha_word_line = [
        word for word in tokenized_line.lower().split() if not word.isdigit()
    ]

    return is_alpha_word_line
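The helpers cleanhtml and p_tokenize are used throughout these examples but never defined in them; the sketch below is one plausible implementation, assuming a regex-based tag stripper and NLTK's word_tokenize (both definitions are assumptions, not part of the original code).

import re
from nltk.tokenize import word_tokenize

def cleanhtml(raw_html):
    # hypothetical helper: strip anything that looks like an HTML/XML tag
    return re.sub(r'<.*?>', '', raw_html)

def p_tokenize(text):
    # hypothetical helper: delegate to NLTK's default word tokenizer
    return word_tokenize(text)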
Example #2
    def __iter__(self):
        # walk every file under self.dirname and yield one token list per non-empty line
        for root, dirs, files in os.walk(self.dirname):
            for filename in files:
                file_path = os.path.join(root, filename)
                with open(file_path) as f:
                    for line in f:
                        sline = line.strip()
                        if sline == "":
                            continue
                        rline = cleanhtml(sline)

                        tokenized_line = ' '.join(p_tokenize(rline))
                        # keep only purely alphabetic tokens, lowercased
                        is_alpha_word_line = [
                            word for word in tokenized_line.lower().split()
                            if word.isalpha()
                        ]
                        yield is_alpha_word_line
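Because __iter__ yields one cleaned token list at a time, the enclosing class can be streamed straight into gensim without loading the whole directory into memory. A minimal usage sketch, assuming the class is called DirSentences and takes the directory path in its constructor (both are assumptions).

from gensim.models import Word2Vec

sentences = DirSentences('/path/to/text/files')  # hypothetical wrapper around the __iter__ above
model = Word2Vec(sentences, min_count=5)         # gensim re-iterates the corpus as needed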
Example #3
def parse_sent(sentence):
    """Parse a sentence into a list of lowercase, alphabetic words.
    """
    # strip leading/trailing whitespace
    sline = sentence.strip()
    # optional extra cleanup, disabled here
    # sline = sline.strip("%")
    # sline = sline.rstrip("'s")
    # remove HTML markup
    rline = cleanhtml(sline)
    # tokenize the line into words
    tokenized_line = ' '.join(p_tokenize(rline))
    # keep only purely alphabetic tokens, lowercased
    is_alpha_word_line = [
        word for word in tokenized_line.lower().split() if word.isalpha()
    ]

    return is_alpha_word_line
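A quick usage check; the sample sentence and the expected output assume the hypothetical cleanhtml/p_tokenize helpers sketched earlier.

print(parse_sent("The <b>quick</b> brown fox has 2 ears."))
# expected: ['the', 'quick', 'brown', 'fox', 'has', 'ears']  (HTML, digits and punctuation dropped)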
name = 'computer_age_statis.pdf'
file_name = os.path.join(cur_dir, 'data', 'docs', name)
txt_file = os.path.join(cur_dir, name)

sentences = word2vec.Text8Corpus('text8')
import nltk.data
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
# print '\n-----\n'.join(tokenizer.tokenize(text))
words = []
# split the raw text into sentences, then clean each one (same steps as parse_sent above)
sentences = n_tokenize.sent_tokenize(text)
for line in sentences:
    sline = line.strip()
    if sline == "":
        continue
    rline = cleanhtml(sline)
    tokenized_line = ' '.join(p_tokenize(rline))
    is_alpha_word_line = [
        word for word in tokenized_line.lower().split() if word.isalpha()
    ]
    words.append(is_alpha_word_line)
# function words that should not break up a detected phrase
common_terms = ["of", "with", "without", "and", "or", "the", "a", "an"]
# learn frequent bigrams from the cleaned sentences
# (the common_terms keyword was renamed connector_words in gensim 4.0)
phrases = Phrases(words, min_count=1, threshold=2, common_terms=common_terms)
# bigram = Phraser(phrases, common_terms=common_terms)
# sent = [u'the', u'mayor', u'of', u'new', u'york', u'was', u'there']
# pprint.pprint(bigram[sent])
print('\n')
# pprint.pprint(list(bigram[words]))
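A hedged sketch of how the learned phrases could be applied, mirroring the commented-out lines above; note that Phraser itself takes only the trained Phrases model.

from gensim.models.phrases import Phraser

bigram = Phraser(phrases)  # freeze the phrase model for fast lookup
sent = [u'the', u'mayor', u'of', u'new', u'york', u'was', u'there']
print(bigram[sent])        # detected phrases come back joined with '_'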

# model.most_similar(positive=['woman', 'king'], negative=['man'], topn=1)
# model.most_similar(positive=['woman', 'king'], negative=['man'], topn=2)
# model.most_similar(['titular'])
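The model queried in the commented lines above is never built in this excerpt; a minimal training sketch, assuming gensim's Word2Vec over the cleaned words corpus (all parameter values are illustrative).

from gensim.models import Word2Vec

model = Word2Vec(words, min_count=1, workers=4)  # illustrative hyperparameters
print(model.wv.most_similar(positive=['woman', 'king'], negative=['man'], topn=1))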