def fetch_document_bigrams(self, document_lemmas, number_of_bigrams=100):
        """
        Given the lemmas of a document, collects the document's bigrams and
        returns those among the N most frequent (N=number_of_bigrams) that
        also appear in self.top_bigrams, each repeated according to its count.
        """
        if not self.include_bigrams:
            return []

        bigram = Phrases()
        bigram.add_vocab([document_lemmas])
        bigram_counter = Counter()

        for key in bigram.vocab.keys():
            if key not in STOPWORDS_BYTES:
                if len(key.split("_")) > 1:
                    bigram_counter[key] += bigram.vocab[key]

        bigram_iterators = [
            repeat(bigram, bigram_count)
            for bigram, bigram_count
            in bigram_counter.most_common(number_of_bigrams)
        ]
        found_bigrams = list(chain(*bigram_iterators))
        known_bigrams = [bigram for bigram in found_bigrams if bigram in self.top_bigrams]

        return known_bigrams
Code example #2
def train_phrasal():
    sentences = load_sentences('/nlp/data/romap/unigram_docs/')
    bigram = Phrases()
    for sentence in sentences:
        bigram.add_vocab([sentence])
    print(len(sentences))

    model = gensim.models.Word2Vec(bigram[sentences], size=200)
    model.wv.save_word2vec_format(
        '/nlp/data/romap/naacl-pattern/w2v/Collocation-w2v.txt', binary=False)
Code example #3
def write_to_file_chartssa(no_delexi_charts: List[str],
                           all_sents: List[List[str]]) -> None:
    with open(os.path.join('chartssa/original_data/', 'chartssa.box'),
              'w') as g:
        with open(os.path.join('chartssa/original_data/', 'train.box'),
                  'w') as train:
            with open(os.path.join('chartssa/original_data/', 'test.box'),
                      'w') as test:
                with open(os.path.join('chartssa/original_data/', 'valid.box'),
                          'w') as valid:

                    for chart in no_delexi_charts:

                        chart_descs, reversed_chart_descs = turn_chart_info_into_sentences(
                            chart)

                        #pprint(chart_descs)
                        #pprint(reversed_chart_descs)
                        #pprint(all_sents)

                        #!!! all_sents must contain a list of sentences, with each sentence being a list of words
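                        # e.g. (illustrative shape only):
                        #   all_sents = [["Financial", "Groups", "grew"], ["Law", "Firms", "merged"]]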
                        bigram = Phrases(all_sents, min_count=1, threshold=10)
                        bigram.add_vocab([["Financial", "Groups"],
                                          ["Law", "Firms"],
                                          ["Computer", "Science"]])
                        #print("aaaaaaaa=",bigram[['Financial', 'Groups', 'are', 'more', 'awsome', 'than', 'law', 'firms', 'and', 'even', 'more', 'awesome', 'than', 'computer', 'science']])
                        #print(bigram.vocab)

                        new_reversed_chart_descs = convert_sentences_to_bigrams(
                            reversed_chart_descs, bigram)
                        #pprint(new_reversed_chart_descs)

                        # chart_lines_sa : all the sentences belonging to chart `chart`
                        chart_lines_sa = generate_files_sa(
                            new_reversed_chart_descs)
                        #pprint(chart_lines_sa)

                        len_all_chart_sentences = len(chart_lines_sa)
                        #print("len=", len_all_chart_sentences)
                        g.write(''.join(chart_lines_sa))

                        for line_idx, chart_line in enumerate(chart_lines_sa):
                            if line_idx < 5:
                                #print("test=", line_idx)
                                test.write(chart_line)
                            elif line_idx < 10:
                                #print("valid=", line_idx)
                                valid.write(chart_line)
                            elif line_idx < len_all_chart_sentences:
                                #print("train=", line_idx)
                                train.write(chart_line)
Code example #4
File: preprocessing.py Project: x0rzkov/rssBriefing
def train_phrases(tokenized_corpus):
    # If no pretrained Phrases model is available, instantiate one:
    if not os.path.isfile(os.path.join(module_path, 'models', 'Phrases_model')):
        bigram = Phrases(tokenized_corpus,
                         min_count=5,  # Ignore all words and bigrams with total collected count lower than this value.
                         common_terms=common_terms)  # List of stop words that won’t affect frequency count of expressions containing them.
        save_phrases(bigram)

    # Otherwise load the pretrained model and update it
    else:
        bigram = load_phrases()
        bigram.add_vocab(tokenized_corpus)
        save_phrases(bigram)

    return bigram
Code example #5
File: overkill.py Project: MaxwellRebo/broca
class OverkillTokenizer(Tokenizer):
    def __init__(self, lemmatize=True, n_jobs=1, bigram=None, trigram=None, min_count=5, threshold=10.):
        self.lemmatize = lemmatize
        self.n_jobs = n_jobs
        self.bigram = bigram
        self.trigram = trigram
        self.min_count = min_count
        self.threshold = threshold

    def tokenize(self, docs):
        # keep lem defined even when lemmatization is disabled; it is passed to
        # pre_tokenize below either way
        lem = WordNetLemmatizer() if self.lemmatize else None

        #print('RAKE tokenizing...')
        pre_tdocs = RAKETokenizer(n_jobs=self.n_jobs).tokenize(docs)

        for i, tdoc in enumerate(pre_tdocs):
            for t in tdoc:
                if t.startswith('one'):
                    print(t)
                    print(i)

        #print('Additional Tokenizing docs...')
        if self.n_jobs == 1:
            tdocs = [pre_tokenize(doc, tdoc, lem=lem) for doc, tdoc in zip(docs, pre_tdocs)]
        else:
            tdocs = parallel(partial(pre_tokenize, lem=lem), zip(docs, pre_tdocs), self.n_jobs, expand_args=True)

        #print('Training bigram...')
        if self.bigram is None:
            self.bigram = Phrases(tdocs,
                                  min_count=self.min_count,
                                  threshold=self.threshold,
                                  delimiter=b' ')
        else:
            self.bigram.add_vocab(tdocs)

        #print('Training trigram...')
        if self.trigram is None:
            self.trigram = Phrases(self.bigram[tdocs],
                                   min_count=self.min_count,
                                   threshold=self.threshold,
                                   delimiter=b' ')
        else:
            self.trigram.add_vocab(self.bigram[tdocs])

        return [tdoc for tdoc in self.trigram[self.bigram[tdocs]]]
Code example #6
def train_mwe_model_from_json(articles):
    # if path.exists("./models/model"):
    #     phrases_model = SaveLoad.load("./models/model")
    # else:
    phrases_model = Phrases(common_terms=accepted_connectors, min_count=5)
    for document in articles:

        if 'text' in document.keys():
            text = document["text"]
            phrases_model.add_vocab([text.split(" ")])

        if 'title' in document.keys():
            title = document["title"]
            phrases_model.add_vocab([title.split(" ")])
    # phrases_model.save("./models/model")
    phraser_model = Phraser(phrases_model)
    return phraser_model
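
A brief usage sketch for the returned Phraser (illustrative only; `articles` is assumed to be the parsed JSON list of documents this function expects, and which phrases get joined depends on the corpus statistics):

phraser = train_mwe_model_from_json(articles)
tokens = "machine learning is widely used".split(" ")
print(phraser[tokens])  # frequent multi-word expressions come back joined, e.g. "machine_learning"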
Code example #7
def write_to_file_chartssb(no_delexi_charts: List[str],
                           all_sents: List[List[str]]) -> None:
    with open(os.path.join('chartssb/original_data/', 'chartssb.box'),
              'w') as g:
        with open(os.path.join('chartssb/original_data/', 'train.box'),
                  'w') as train:
            with open(os.path.join('chartssb/original_data/', 'test.box'),
                      'w') as test:
                with open(os.path.join('chartssb/original_data/', 'valid.box'),
                          'w') as valid:

                    for chart in no_delexi_charts:

                        chart_descs, _ = turn_chart_info_into_sentences(chart)
                        #print(chart_descs)

                        bigram2 = Phrases(all_sents, min_count=1, threshold=2)
                        bigram2.add_vocab([["Financial", "Groups"],
                                           ["Law", "Firms"],
                                           ["Computer", "Science"]])
                        print("vocab=", bigram2.vocab)
                        chart_infos_sentb = turn_dict_into_sent_b(chart_descs)
                        new_infos = convert_chartssb_to_bigrams(
                            chart_infos_sentb, bigram2)
                        chart_lines_sentb = generate_files_sb(new_infos)
                        len_all_chart_sentences = len(chart_lines_sentb)
                        print("len=", len_all_chart_sentences)
                        g.write(''.join(chart_lines_sentb))

                        for line_idx, chart_line in enumerate(
                                chart_lines_sentb):
                            if line_idx < 5:
                                #print("test=", line_idx)
                                test.write(chart_line)
                            elif line_idx < 10:
                                #print("valid=", line_idx)
                                valid.write(chart_line)
                            elif line_idx < len_all_chart_sentences:
                                #print("train=", line_idx)
                                train.write(chart_line)
Code example #8
def word2vec_measure():
    article_names = ["expressen", "aftonbladet", "svd", "dn"]  #,
    sentences = []

    for single_article in article_names:

        print(" \n *** " + single_article + " *****")
        articles = db.get_articles(single_article)
        bigram = Phrases()

        for row in articles:
            row = IO.filter_text(row.lower())
            sentence = [
                word for word in row if word not in stopwords.words('swedish')
            ]

            sentences.append(sentence)
            bigram.add_vocab([sentence])

    print(len(sentences))

    num_features = 300  # Word vector dimensionality
    min_word_count = 5  # Minimum word count
    num_workers = 8  # Number of threads to run in parallel
    context = 5  # `context window` is the maximum distance between the current and predicted word within a sentence.
    downsampling = 1e-3  # Downsample setting for frequent words

    # bigram_model = Word2Vec(bigram[sentences], size=100)
    bigram_model = Word2Vec(bigram[sentences], workers=num_workers, \
            size=num_features, sg=1, min_count = min_word_count, \
            window = context, sample = downsampling)

    word2vec_result = bigram_model.most_similar(
        positive=['muslimska_brödraskapet'], topn=200)
    # filepath = prop.word2vec_count+single_article+".tsv"
    filepath = prop.word2vec_count + "all_10.tsv"

    IO.write_tuple(word2vec_result, filepath)
Code example #9
def bigrams_with_gensim(data):
    from gensim.models import Phrases
    bigram = Phrases()
    sentences = []
    for row in data:
        title = row['Headings'].replace('[','').replace(']','').replace("'",'')
        title = title + '.'
        #title = title.replace('--',' -- ')
        sentence = [word for word in nltk.word_tokenize(title.lower())
                    if word not in string.punctuation]
        sentences.append(sentence)
        bigram.add_vocab([sentence])
    bigram_counter = Counter()
    for key in bigram.vocab.keys():
        if key not in stopwords.words("english"):
            spl = re.split(b'_', key)
            spl = [s for s in spl if s != b'']
            if len(spl) > 1:
                bigram_counter[key] += bigram.vocab[key]
    print('Bigrams with gensim')
    for key, counts in bigram_counter.most_common(50):
        print('{}: {}'.format(key, counts))
    return bigram
Code example #10
def trainWord2Vec(fileName, modelName):
    # train word2vec on the two sentences
    file = open("../data/mergedDatasets/" + fileName, "r",
                encoding="ascii", errors="ignore")
    #sentences = file.read()
    bigram = Phrases()
    lines = []
    for line in file:
        # replace special characters with spaces so they do not get merged into words
        for charec in line:
            if charec in [
                    ',', '\'', '.', '-', '_', '!', '|', '@', '#', '$', '%',
                    '^', '*', '~', '(', ')', '{', '}'
            ]:
                line = line.replace(charec, ' ')
        #remove pre and post white spaces
        line = line.strip()

        if len(line) > 0:
            wordArray = line.split(" ")
            wordArray = [word.lower() for word in wordArray]
            if len(wordArray) > 15:
                lines.append(wordArray)
    sentences = lines

    print(sentences)

    bigram.add_vocab(sentences)
    trigram = Phrases(bigram[sentences])
    fourgram = Phrases(trigram[bigram[sentences]])
    #for a in bigram.vocab.keys():
    #    if str(a).find("_")>0 :print a
    mymodel = gensim.models.Word2Vec(fourgram[trigram[bigram[sentences]]],
                                     min_count=15,
                                     size=200,
                                     workers=4)
    mymodel.save("../word2VecModels/" + modelName)
Code example #11
                    extra_testing_mat[row - N_TRAINING, cuisine2id[cuisine]] = 1
with open("extra_testing_matrix.pyobject", "wb") as f:
    pickle.dump(extra_testing_mat, f)
finish = time()
print("Complete!")
print("Running time: %.2f seconds" % (finish - start,))
print()


# BIGRAMS & TRIGRAMS
print("Creating n-gram corpus from training corpus...")
start = time()
phrases = Phrases(min_count=3, threshold=10.0)
with open("training_corpus.txt", "rt") as f:
    for line in f:
        phrases.add_vocab([line.rstrip().split()])
    _ = f.seek(0)
    with open("bigram_training_corpus.txt", "wt") as g:
        for line in f:
            word_list = phrases[line.rstrip().split()]
            g.write(" ".join(word_list) + "\n")
phrases = Phrases(min_count=3, threshold=10.0)
with open("bigram_training_corpus.txt", "rt") as f:
    for line in f:
        phrases.add_vocab([line.rstrip().split()])
    _ = f.seek(0)
    with open("trigram_training_corpus.txt", "wt") as g:
        for line in f:
            word_list = phrases[line.rstrip().split()]
            g.write(" ".join(word_list) + "\n")
finish = time()
Code example #12
File: word2vec.py Project: vliegenthart/TSE-NER
    if len(sys.argv) < 4:
        #print(globals()['__doc__'] % locals())
        sys.exit(1)
    sentence_stream = []
    inp, outp1, outp2 = sys.argv[1:4]
    filesent = open(inp)
    sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
    lines = (sent_detector.tokenize(filesent.read().strip()))
    bigram_transformer = Phrases(min_count=1, threshold=2)
    for line in lines:
        sentence = [
            word.decode("utf-8") for word in nltk.word_tokenize(line.lower())
            if word not in string.punctuation
        ]
        sentence_stream.append(sentence)
        bigram_transformer.add_vocab([sentence])

    #sentence_stream = [doc.split(" ") for doc in lines]

    #bigram_transformer = Phrases(sentence_stream, min_count=1, threshold=2)
    model = Word2Vec(bigram_transformer[sentence_stream],
                     size=100,
                     window=2,
                     min_count=2,
                     workers=multiprocessing.cpu_count(),
                     sg=1)

    # model = Word2Vec(LineSentence(inp), size=400, window=5, min_count=2,
    #                 workers=multiprocessing.cpu_count())

    # trim unneeded model memory = use(much) less RAM
Code example #13
class TopicModel(object):
    '''
    This module preprocesses a corpus of documents and runs
    Latent Dirichlet Allocation (LDA) on a corpus of documents.
    
    Parameters
    ----------
    num_topics: int, default: 100
        input parameter to LDA
    
    min_word_count: int, default: 20
        if a token has fewer than min_word_count occurrences
        in the entire corpus, then it will be pruned from the 
        processed corpus
    
    top_most_common_words: int, default: 10
        prune tokens that are within the top_most_common_words 
        throughout the entire corpus 
    
    min_doc_length: int, default: 40
        if the number of tokens within a processed document 
        is less than min_doc_length, then the document is excluded
    
    max_doc_length: int, default: 1000
        if the number of tokens within a processed document 
        is greater than max_doc_length, then the document is excluded
    
    random_state: default: None
        the random seed for the Gensim LDA object
    
    Attributes
    ----------
    bigramizer: 
        the trained Gensim bigramizer
    
    tokens: 
        list of list of strings
    
    dictionary: 
        mapping from id to token
    
    corpus: 
        bag of words vectorization of the tokens
    
    lda: 
        the Gensim LDA object
      
    dominant_topic_ids: 
        list of dominant topic ids, in decreasing order of dominance
    '''
    def __init__(self,
                 num_topics=100,
                 min_word_count=20,
                 top_most_common_words=10,
                 min_doc_length=40,
                 max_doc_length=1000,
                 random_state=None):
        self.num_topics = num_topics
        self.min_word_count = min_word_count
        self.top_most_common_words = top_most_common_words

        assert max_doc_length > min_doc_length, \
               "max_doc_length must be greater than min_doc_length"
        self.min_doc_length = min_doc_length
        self.max_doc_length = max_doc_length
        self.random_state = random_state

        # natural language processing
        self.stop_words = self.getEnglishStopWords()
        self.bigramizer = Phrases()

    def fit(self, documents):
        '''
        parameters:
          documents: list of strings, each represents a document
        '''

        # tokens, dictionary, corpus for LDA
        self.tokens = self.preProcessCorpus(documents)
        self.dictionary = corpora.Dictionary(self.tokens)
        self.corpus = [self.dictionary.doc2bow(text) for text in self.tokens]

        self.lda = self.getLDA(dictionary=self.dictionary,
                               corpus=self.corpus,
                               num_topics=self.num_topics,
                               random_state=self.random_state)

        self.num_dominant_topics = min(10, self.num_topics)
        self.dominant_topic_ids = self.getDominantTopics(
            self.corpus, self.lda, self.num_dominant_topics)

    def __str__(self):
        description = (
            "topic model:\n\ttoken length = {0:,d}\n\tdictionary length = {1:,d}"
            "\n\tnum_topics = {2:,d}\n\tmin_word_count = {3:,d}"
            "\n\ttop_most_common_words = {4:,d}\n\tmin_doc_length = {5:,d}"
            "\n\tmax_doc_length = {6:,d}")
        return description.format(len(self.tokens), len(self.dictionary),
                                  self.num_topics, self.min_word_count,
                                  self.top_most_common_words,
                                  self.min_doc_length, self.max_doc_length)

    @staticmethod
    def getEnglishStopWords():
        '''
        returns a set of stop words for NLP pre-processing
        from nltk.corpus.stopwords()
        Also, some words and letters are added to the set,
        such as "please", "sincerely", "u", etc...
        '''
        stop_words = set(stopwords.words("english"))

        stop_words.add('please')
        stop_words.add('would')
        stop_words.add('use')
        stop_words.add('also')
        stop_words.add('thank')
        stop_words.add('sincerely')
        stop_words.add('regards')
        stop_words.add('hi')
        stop_words.add('hello')
        stop_words.add('greetings')
        stop_words.add('hey')
        stop_words.add('attachment')
        stop_words.add('attached')
        stop_words.add('attached_file')
        stop_words.add('see')
        stop_words.add('file')
        stop_words.add('comment')
        for item in 'abcdefghijklmnopqrstuvwxyz':
            stop_words.add(item)
        return stop_words

    @staticmethod
    def getFrequencies(tokens):
        """
        input: tokens, a list of list of tokens
        output: a collections.Counter() object that contains token counts
        """
        frequencies = Counter()
        for row in tokens:
            frequencies.update(row)
        return frequencies

    @staticmethod
    def getLowFreqWords(frequencies, countCutOff):
        """
        input: 
          frequencies: a collections.Counter() object
          countCutOff: the minimum frequency below which tokens are added to the set
                       of low frequency tokens
        """
        lowFreqTokens = set()
        for token, freq in frequencies.items():
            if freq <= countCutOff:
                lowFreqTokens.add(token)
        return lowFreqTokens

    def preProcessCorpus(self,
                         documents,
                         min_word_count=None,
                         top_most_common_words=None,
                         min_doc_length=None,
                         max_doc_length=None):
        '''
        this function pre-processes the documents and converts them into a list of list of tokens
        
        input: 
          documents: a list of strings (each string represents a document)
          min_word_count: if the frequency count of a token in the corpus is less 
                          than min_word_count then it is pruned
          top_most_common_words: if the frequency count of a token in the corpus
                                 exceeds top_most_common_words then it is pruned 
          min_doc_length: if the number of tokens within a processed document 
                          is less than min_doc_length, then the document is excluded
          max_doc_length: if the number of tokens within a processed document 
                          is greater than max_doc_length, then the document is excluded
        output:
          a list of list of tokens
        '''
        if min_word_count is None:
            min_word_count = self.min_word_count
        if top_most_common_words is None:
            top_most_common_words = self.top_most_common_words
        if min_doc_length is None:
            min_doc_length = self.min_doc_length
        if max_doc_length is None:
            max_doc_length = self.max_doc_length

        tokens = [tokenizer(document) for document in documents]

        # exclude comments that are longer than max_doc_length
        tokens = [tkn for tkn in tokens if len(tkn) < max_doc_length]

        # train Gensim Phrases model for bigrams
        self.bigramizer.add_vocab(tokens)

        # apply Gensim Phrases model to generate bigrams
        tokens = [self.bigramizer[tkn] for tkn in tokens]

        # exclude stop words
        tokens = [[t for t in tkn if t not in self.stop_words]
                  for tkn in tokens]

        # exclude tokens that are shorter than min_doc_length
        tokens = [tkn for tkn in tokens if len(tkn) > min_doc_length]

        # calculate token frequencies to exclude low and high frequency tokens
        freqs = self.getFrequencies(tokens)
        low_freq_tokens = set(x[0] for x in freqs.items()
                              if x[1] < min_word_count)
        high_freq_tokens = [
            word[0] for word in freqs.most_common(top_most_common_words)
        ]

        tokens = [[t for t in tkn if t not in low_freq_tokens]
                  for tkn in tokens]
        tokens = [[t for t in tkn if t not in high_freq_tokens]
                  for tkn in tokens]

        print('\nnumber of low frequency tokens pruned = {:,d}'\
              .format(len(low_freq_tokens)))
        print('min_word_count = {:d}, top_most_common_words = {:,d}'\
              .format(min_word_count, top_most_common_words))
        print('number of high frequency tokens pruned = {:,d}'\
              .format(len(high_freq_tokens)))
        print('tokens = {:,d} rows'.format(len(tokens)))
        print('text pre-processing is complete\n')
        return tokens

    def getLDA(self,
               dictionary=None,
               corpus=None,
               num_topics=None,
               random_state=None):
        # get LDA for dictionary_all and corpus_all
        print('computing LDA...')

        if dictionary is None:
            dictionary = self.dictionary
        if corpus is None:
            corpus = self.corpus
        if num_topics is None:
            num_topics = self.num_topics

        lda = models.ldamodel.LdaModel(corpus=corpus,
                                       alpha='auto',
                                       id2word=dictionary,
                                       num_topics=num_topics,
                                       random_state=random_state)
        return lda

    def getDominantTopics(self, corpus, lda, num_dominant_topics=None):

        print('computing dominant topics...')
        if corpus is None:
            corpus = self.corpus
        if lda is None:
            lda = self.lda
        if num_dominant_topics is None:
            num_dominant_topics = self.num_dominant_topics

        # get topic weight matrix using lda.inference
        # the matrix has dimensions (num documents) x (num topics)
        inference = lda.inference(corpus)
        inference = inference[
            0]  # the inference is a tuple, need the first term
        num_topics = lda.num_topics

        # find dominant topics across documents (vertical sum)
        column_sum_of_weights = np.sum(inference, axis=0)
        sorted_weight_indices = np.argsort(column_sum_of_weights)
        idx = np.arange(num_topics - num_dominant_topics, num_topics)
        dominant_topic_ids = sorted_weight_indices[idx]

        # the dominant_topic_ids store the ids in descending order of dominance
        dominant_topic_ids = dominant_topic_ids[::-1]

        # convert from numpy array to list and return
        return dominant_topic_ids.tolist()
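
A minimal usage sketch for the TopicModel class above (illustrative only; `documents` is assumed to be a list of raw text strings and the class's dependencies, such as the tokenizer and NLTK stop words, are assumed to be importable):

tm = TopicModel(num_topics=50, min_word_count=10, random_state=42)
tm.fit(documents)   # tokenizes, trains the bigramizer, builds the dictionary/corpus, runs LDA
print(tm)           # summary of corpus size and model settings
print(tm.dominant_topic_ids)  # dominant topic ids, in decreasing order of dominance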
Code example #14
File: app.py Project: atran/sense2vec
import bz2
import string
import nltk
from collections import Counter
from gensim.models import Phrases
from gensim.models import Word2Vec
from nltk.corpus import stopwords

sentences = []
bigram = Phrases()

with bz2.BZ2File('./2009.csv.bz2') as file_:
    for i, line in enumerate(file_):
        sentence = [word
                    for word in nltk.word_tokenize(line.decode("utf-8").lower())
                    if word not in string.punctuation]
        sentences.append(sentence)
        bigram.add_vocab([sentence])

bigram_model = Word2Vec(bigram[sentences])
bigram_model_counter = Counter()

bigram_model.save('ok.w2v')

for key in bigram_model.vocab.keys():
    if key not in stopwords.words("english"):
        if len(key.split("_")) > 1:
            bigram_model_counter[key] += bigram_model.vocab[key].count

for key, counts in bigram_model_counter.most_common(50):
    print('{0: <20} {1}'.format(key.encode("utf-8"), counts))
Code example #15
from gensim.models import Phrases
from gensim.test.utils import datapath

bigrams = Phrases(min_count=3, threshold=6)
with open(
        'C:\\Users\\sreek\\PycharmProjects\\SEC-Edgar-Data\\tokenized_file.txt',
        'r'
) as tinf, open(
        'C:\\Users\\sreek\\PycharmProjects\\SEC-Edgar-Data\\bi_gram_corpa.txt',
        'a') as toutf:
    for line in tinf:
        tokens = line.split()
        bigrams.add_vocab([tokens])  # add_vocab expects an iterable of token lists, not a raw string

        toutf.write(' '.join(bigrams[tokens]) + '\n')
Code example #16
bigram = Phrases(min_count=10)

print "====================================== Start bigram training ======================================"

# train bigram
for city in Cities:

    print "====================================== City %s bigram training ======================================" % city

    filepath = "CityTextCorpus/%s/*/part-00000" % city

    cityFileList = glob.glob(filepath)

    cityTweets = MySentences(cityFileList)

    bigram.add_vocab(cityTweets)

print "====================================== Start trigram training ======================================"

# create phrase detector for trigram
trigram = Phrases(min_count=5)

# train trigram
for city in Cities:

    print "====================================== City %s trigram training ======================================" % city

    filepath = "CityTextCorpus/%s/*/part-00000" % city

    cityFileList = glob.glob(filepath)
Code example #17
class TopicModel(object):
    '''
    This module preprocesses a corpus of documents and runs
    Latent Dirichlet Allocation (LDA) on a corpus of documents.
    
    Parameters
    ----------
    num_topics: int, default: 100
        input parameter to LDA
    
    min_word_count: int, default: 20
        if a token has fewer than min_word_count occurrences
        in the entire corpus, then it will be pruned from the 
        processed corpus
    
    top_most_common_words: int, default: 10
        prune tokens that are within the top_most_common_words 
        throughout the entire corpus 
    
    min_doc_length: int, default: 40
        if the number of tokens within a processed document 
        is less than min_doc_length, then the document is excluded
    
    max_doc_length: int, default: 1000
        if the number of tokens within a processed document 
        is greater than max_doc_length, then the document is excluded
    
    random_state: default: None
        the random seed for the Gensim LDA object
    
    Attributes
    ----------
    bigramizer: 
        the trained Gensim bigramizer
    
    tokens: 
        list of list of strings
    
    dictionary: 
        mapping from id to token
    
    corpus: 
        bag of words vectorization of the tokens
    
    lda: 
        the Gensim LDA object
      
    dominant_topic_ids: 
        list of dominant topic ids, in decreasing order of dominance
    '''

    def __init__(self, num_topics=100, min_word_count=20, 
                 top_most_common_words=10, min_doc_length=40, 
                 max_doc_length=1000, random_state=None):
        self.num_topics = num_topics
        self.min_word_count = min_word_count
        self.top_most_common_words = top_most_common_words
        
        assert max_doc_length > min_doc_length, \
               "max_doc_length must be greater than min_doc_length"
        self.min_doc_length = min_doc_length
        self.max_doc_length = max_doc_length
        self.random_state = random_state
        
        # natural language processing
        self.stop_words = self.getEnglishStopWords()
        self.bigramizer = Phrases()
        
    def fit(self, documents):
        '''
        parameters:
          documents: list of strings, each represents a document
        '''
        
        # tokens, dictionary, corpus for LDA
        self.tokens = self.preProcessCorpus(documents)
        self.dictionary = corpora.Dictionary(self.tokens)
        self.corpus = [self.dictionary.doc2bow(text) for text in self.tokens]
        
        self.lda = self.getLDA(dictionary=self.dictionary, 
                               corpus=self.corpus, 
                               num_topics=self.num_topics, 
                               random_state=self.random_state)
        
        self.num_dominant_topics=min(10, self.num_topics)
        self.dominant_topic_ids = self.getDominantTopics(self.corpus, 
                                                         self.lda, 
                                                         self.num_dominant_topics)


    def __str__(self):
        description = ("topic model:\n\ttoken length = {0:,d}\n\tdictionary length = {1:,d}"
                       "\n\tnum_topics = {2:,d}\n\tmin_word_count = {3:,d}"
                       "\n\ttop_most_common_words = {4:,d}\n\tmin_doc_length = {5:,d}"
                       "\n\tmax_doc_length = {6:,d}")
        return description.format(len(self.tokens), 
                                  len(self.dictionary),
                                  self.num_topics, 
                                  self.min_word_count, 
                                  self.top_most_common_words, 
                                  self.min_doc_length, 
                                  self.max_doc_length)

    @staticmethod
    def getEnglishStopWords():
        '''
        returns a set of stop words for NLP pre-processing
        from nltk.corpus.stopwords()
        Also, some words and letters are added to the set,
        such as "please", "sincerely", "u", etc...
        '''
        stop_words = set(stopwords.words("english"))
        
        stop_words.add('please')
        stop_words.add('would')
        stop_words.add('use')
        stop_words.add('also')
        stop_words.add('thank')
        stop_words.add('sincerely')
        stop_words.add('regards')
        stop_words.add('hi')
        stop_words.add('hello')
        stop_words.add('greetings')
        stop_words.add('hey')
        stop_words.add('attachment')
        stop_words.add('attached')
        stop_words.add('attached_file')
        stop_words.add('see')
        stop_words.add('file')
        stop_words.add('comment')
        for item in 'abcdefghijklmnopqrstuvwxyz':
            stop_words.add(item)
        return stop_words
    
    
    @staticmethod
    def getFrequencies(tokens):
        """
        input: tokens, a list of list of tokens
        output: a collections.Counter() object that contains token counts
        """
        frequencies = Counter()
        for row in tokens:
            frequencies.update(row)
        return frequencies
    
    @staticmethod
    def getLowFreqWords(frequencies, countCutOff):
        """
        input: 
          frequencies: a collections.Counter() object
          countCutOff: the minimum frequency below which tokens are added to the set
                       of low frequency tokens
        """
        lowFreqTokens = set()
        for token, freq in frequencies.items():
            if freq <= countCutOff:
                lowFreqTokens.add(token)
        return lowFreqTokens


    def preProcessCorpus(self, documents, min_word_count=None, 
                         top_most_common_words=None, min_doc_length=None, 
                         max_doc_length=None):
        '''
        this function pre-processes the documents and converts them into a list of list of tokens
        
        input: 
          documents: a list of strings (each string represents a document)
          min_word_count: if the frequency count of a token in the corpus is less 
                          than min_word_count then it is pruned
          top_most_common_words: if the frequency count of a token in the corpus
                                 exceeds top_most_common_words then it is pruned 
          min_doc_length: if the number of tokens within a processed document 
                          is less than min_doc_length, then the document is excluded
          max_doc_length: if the number of tokens within a processed document 
                          is greater than max_doc_length, then the document is excluded
        output:
          a list of list of tokens
        '''
        if min_word_count is None:
            min_word_count = self.min_word_count
        if top_most_common_words is None:
            top_most_common_words = self.top_most_common_words
        if min_doc_length is None:
            min_doc_length = self.min_doc_length
        if max_doc_length is None:
            max_doc_length = self.max_doc_length
        
        tokens = [tokenizer(document) for document in documents]
        
        # exclude comments that are longer than max_doc_length
        tokens = [tkn for tkn in tokens if len(tkn) < max_doc_length]
        
        # train Gensim Phrases model for bigrams
        self.bigramizer.add_vocab(tokens)
        
        # apply Gensim Phrases model to generate bigrams
        tokens = [self.bigramizer[tkn] for tkn in tokens]
        
        # exclude stop words
        tokens = [[t for t in tkn if t not in self.stop_words] for tkn in tokens]
        
        # exclude tokens that are shorter than min_doc_length
        tokens = [tkn for tkn in tokens if len(tkn) > min_doc_length]
        
        # calculate token frequencies to exclude low and high frequency tokens
        freqs = self.getFrequencies(tokens)
        low_freq_tokens = set(x[0] for x in freqs.items() if x[1] < min_word_count)
        high_freq_tokens = [word[0] for word in freqs.most_common(top_most_common_words)]
        
        tokens =  [[t for t in tkn if t not in low_freq_tokens] for tkn in tokens]
        tokens =  [[t for t in tkn if t not in high_freq_tokens] for tkn in tokens]
        
        print('\nnumber of low frequency tokens pruned = {:,d}'
              .format(len(low_freq_tokens)))
        print('min_word_count = {:d}, top_most_common_words = {:,d}'
              .format(min_word_count, top_most_common_words))
        print('number of high frequency tokens pruned = {:,d}'
              .format(len(high_freq_tokens)))
        print('tokens = {:,d} rows'.format(len(tokens)))
        print('text pre-processing is complete\n')
        return tokens


    def getLDA(self, dictionary=None, corpus=None, num_topics=None, 
               random_state=None):
        # get LDA for dictionary_all and corpus_all
        print('computing LDA...')
        
        if dictionary is None:
            dictionary = self.dictionary
        if corpus is None:
            corpus = self.corpus
        if num_topics is None:
            num_topics = self.num_topics
        
        lda = models.ldamodel.LdaModel(corpus=corpus, 
                                       alpha='auto', 
                                       id2word=dictionary, 
                                       num_topics=num_topics,
                                       random_state=random_state)
        return lda


    def getDominantTopics(self, corpus, lda, num_dominant_topics=None):
        
        print('computing dominant topics...')
        if corpus is None:
            corpus = self.corpus
        if lda is None:
            lda = self.lda
        if num_dominant_topics is None:
            num_dominant_topics = self.num_dominant_topics
        
        # get topic weight matrix using lda.inference
        # the matrix has dimensions (num documents) x (num topics)
        inference = lda.inference(corpus)
        inference = inference[0] # the inference is a tuple, need the first term
        num_topics = lda.num_topics
        
        # find dominant topics across documents (vertical sum)
        column_sum_of_weights = np.sum(inference, axis=0)
        sorted_weight_indices = np.argsort(column_sum_of_weights)
        idx = np.arange(num_topics - num_dominant_topics, num_topics)
        dominant_topic_ids = sorted_weight_indices[idx]
        
        # the dominant_topic_ids store the ids in descending order of dominance
        dominant_topic_ids = dominant_topic_ids[::-1]
        
        # convert from numpy array to list and return
        return dominant_topic_ids.tolist()
Code example #18
abstrct = []
ngram = Phrases()

# creating dataframe
datadf = pd.read_pickle(file_dir)

years = np.array(datadf.Publication_Year)

for i in np.arange(len(datadf.index)):
    texts = [
        word
        for word in nltk.word_tokenize(datadf.iloc[i]['Abstracts'].lower())
        if word not in string.punctuation and word not in stoplist
    ]
    abstrct.append(texts)
    ngram.add_vocab([texts])

# ## Removing unimportant words from the bag of words

# There are also some unimportant words that are not included in the NLTK stopword list, so we created a text file containing those stop words and use it to clean the data.


with open('new_stop_words.txt', 'r') as f:  # open file in read mode
    new_stopwords_list = f.read()  # copy to a string

stoplist += new_stopwords_list.split()

N = 0  #for phrase

# ## Most frequent words in the collection of abstracts
Code example #20
import csv

from gensim.models import Phrases
from gensim.models import Word2Vec
from gensim.utils import lemmatize
from gensim.parsing.preprocessing import STOPWORDS
from nltk.corpus import stopwords
from collections import Counter

print("Reading input file 'input/audits_with_content.csv'")
with open('input/audits_with_content.csv', 'r') as f:
    reader = csv.reader(f)
    raw_documents = list(reader)

print("Prepare documents")
documents = [doc[2] for doc in raw_documents if doc[2] != '']
sentences = []
bigram = Phrases()

for document in documents:
    raw_text = document.lower()
    tokens = lemmatize(raw_text, stopwords=STOPWORDS)
    sentences.append(tokens)
    bigram.add_vocab([tokens])

bigram_counter = Counter()
for key in bigram.vocab.keys():
    if key not in stopwords.words("english"):
        if len(key.split("_")) > 1:
            bigram_counter[key] += bigram.vocab[key]

for key, counts in bigram_counter.most_common(200):
    print('{0: <20} {1}'.format(key.encode("utf-8"), counts))