    def testMerge(self):
        d = Dictionary(self.texts)
        f = Dictionary(self.texts[:3])
        g = Dictionary(self.texts[3:])

        f.merge_with(g)
        self.assertEqual(sorted(d.token2id.keys()), sorted(f.token2id.keys()))
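A minimal sketch of what `merge_with` does beyond the equality check above (assuming current gensim behaviour, not taken from the test itself): the call mutates `f` in place and also returns a transformer that remaps documents expressed in `g`'s ids into the merged ids.

from gensim.corpora import Dictionary

f = Dictionary([["cat", "dog"]])
g = Dictionary([["dog", "bird"]])
transformer = f.merge_with(g)               # f now covers cat, dog, bird
bow_in_g_ids = g.doc2bow(["dog", "bird"])
print(transformer[bow_in_g_ids])            # the same document re-expressed in f's ids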
Example #2
def build_dictionary(db):
    dictionary = Dictionary()
    for article in db.articles.find():
        dictionary.doc2bow(article['clean_text'], allow_update=True)
    # print dictionary
    # dictionary.save('data/cnn.dict') # store the dictionary, for future reference
    return dictionary
Example #3
    def test_similarity_matrix(self):
        """Test similarity_matrix returns expected results."""

        corpus = [["government", "denied", "holiday"], ["holiday", "slowing", "hollingworth"]]
        dictionary = Dictionary(corpus)
        corpus = [dictionary.doc2bow(document) for document in corpus]

        # checking symmetry and the existence of ones on the diagonal
        similarity_matrix = self.similarity_matrix(corpus, dictionary).todense()  # helper presumably bound by the test fixture
        self.assertTrue((similarity_matrix.T == similarity_matrix).all())
        self.assertTrue((np.diag(similarity_matrix) == 1).all())

        # checking that thresholding works as expected
        similarity_matrix = self.similarity_matrix(corpus, dictionary, threshold=0.45).todense()
        self.assertEqual(18, np.sum(similarity_matrix == 0))

        # checking that exponent works as expected
        similarity_matrix = self.similarity_matrix(corpus, dictionary, exponent=1.0).todense()
        self.assertAlmostEqual(9.5788956, np.sum(similarity_matrix))

        # checking that nonzero_limit works as expected
        similarity_matrix = self.similarity_matrix(corpus, dictionary, nonzero_limit=4).todense()
        self.assertEqual(4, np.sum(similarity_matrix == 0))

        similarity_matrix = self.similarity_matrix(corpus, dictionary, nonzero_limit=3).todense()
        self.assertEqual(20, np.sum(similarity_matrix == 0))
Example #4
    def _generate_vocabulary(self):
        vocab = Dictionary()
        session = DBSession()

        i = 0
        for question in session.query(Question).yield_per(self.yield_per):
            i += 1
            if i % self.print_per == 0:
                logger.info('Processed %d / %d questions :: %d unique tokens' % (i, self.n_questions, vocab.num_docs))

            strings = [question.title, question.content] if question.content is not None else [question.title]
            vocab.add_documents([CorpusDictionary.tokenize(s) for s in strings])

        i = 0
        for answer in session.query(Answer).yield_per(self.yield_per):
            i += 1
            if i % self.print_per == 0:
                logger.info('Processed %d / %d answers :: %d unique tokens' % (i, self.n_answers, vocab.num_docs))

            vocab.add_documents([CorpusDictionary.tokenize(answer.content)])

        # commit and close the session
        session.commit()
        session.close()

        return vocab
def main():
    parser = ArgumentParser()
    parser.add_argument('-d', '--dataset')
    parser.add_argument('-p', '--dataset-path', default=default_dataset_path())
    parser.add_argument('-o', '--output')
    opts = parser.parse_args()

    dataset_name = opts.dataset
    dataset_path = opts.dataset_path
    out_fn = opts.output

    if not out_fn:
        logging.error('--output argument required ...')
        parser.print_usage()
        sys.exit(1)

    if not dataset_name:
        logging.error('--dataset argument required ...')
        parser.print_usage()
        sys.exit(1)

    if dataset_name == 'newsgroups':
        corpus = (preprocess_ng(doc) for doc
                  in newsgroups.iterator(download_file(newsgroups.NEWSGROUPS_ARCHIVE_URL, dataset_path)))
    elif dataset_name == 'ndt':
        dataset = NDTDataset(dataset_path=dataset_path)
        dataset.install()

        corpus = (preprocess_ndt(doc) for doc in dataset)
    else:
        logging.error('Unknown dataset %s ...' % dataset_name)
        sys.exit(1)

    d = Dictionary(corpus)
    d.save_as_text(out_fn, sort_by_word=False)
Example #6
def process_text(corpus, stoplist=None, bigrams=None, trigrams=None, keep_all=False, no_below=10, no_above=0.8):
    """
    Extracts text data from the corpus
    Cleans and tokenizes text data
    Computes most frequent phrases, creates a dictionary and converts the corpus to a BOW model
    :param corpus:
    :return: processed corpus with phrases, dictionary and BOW corpus
    """

    logging.info("Cleaned and tokenzed dataset")
    text_dataset = clean_and_tokenize(corpus, stoplist=stoplist, keep_all=keep_all)

    if bigrams is not None:
        bi_grams = Phrases(text_dataset, threshold=bigrams, min_count=no_below)
        text_dataset = bi_grams[text_dataset]
    elif trigrams is not None:
        # `bigrams` is None in this branch, so build the intermediate bigram model with the default threshold
        bi_grams = Phrases(text_dataset, min_count=no_below)
        tri_grams = Phrases(bi_grams[text_dataset], threshold=trigrams)
        text_dataset = tri_grams[bi_grams[text_dataset]]

    dictionary = Dictionary(text_dataset)
    dictionary.filter_extremes(no_below=no_below, no_above=no_above)
    bow_corpus = [dictionary.doc2bow(text) for text in text_dataset]

    return text_dataset, dictionary, bow_corpus
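A hedged usage sketch of `process_text`; the input list and parameter values are illustrative only, and `clean_and_tokenize` plus the `Phrases` import are assumed to be provided by the surrounding module.

docs = ["the quick brown fox jumps", "the lazy dog sleeps all day"]
texts, dictionary, bow = process_text(docs, stoplist=["the"], bigrams=10.0, no_below=1)
print(dictionary.token2id)   # token -> id mapping after filtering extremes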
Example #7
class LDA(Step):

    def __init__(self, num_topics):
        self._model = None
        self._dictionary = None
        self._n_topics = num_topics

    def fit(self, filename):
        contents = [x for _, x in Reader(filename)]
        self._dictionary = Dictionary(contents)
        corpus = [self._dictionary.doc2bow(text) for text in contents]
        self._model = LdaModel(corpus, num_topics=self._n_topics)

    def transform(self, filename):
        uuids, vectors = self._transform(filename)
        return uuids, vectors

    def _transform(self, filename):
        vectors = []
        uuids = []
        for uuid, tokens in Reader(filename):
            bow = self._dictionary.doc2bow(tokens)
            lda_probs = {dim: prob for dim, prob in self._model[bow]}
            lda_vec = [lda_probs.get(i, 0) for i in range(self._n_topics)]
            vectors.append(lda_vec)
            uuids.append(uuid)
        return uuids, np.array(vectors)

    @classmethod
    def _read(cls, filename):
        for uuid, tokens in Reader(filename):
            yield ' '.join(tokens)
    def test_saveAsText(self):
        """`Dictionary` can be saved as textfile. """
        tmpf = get_tmpfile('save_dict_test.txt')
        small_text = [
            ["prvé", "slovo"],
            ["slovo", "druhé"],
            ["druhé", "slovo"]]

        d = Dictionary(small_text)

        d.save_as_text(tmpf)
        with codecs.open(tmpf, 'r', encoding='utf-8') as file:
            serialized_lines = file.readlines()
            self.assertEqual(serialized_lines[0], u"3\n")
            self.assertEqual(len(serialized_lines), 4)
            # We do not know which word will have which index
            self.assertEqual(serialized_lines[1][1:], u"\tdruhé\t2\n")
            self.assertEqual(serialized_lines[2][1:], u"\tprvé\t1\n")
            self.assertEqual(serialized_lines[3][1:], u"\tslovo\t3\n")

        d.save_as_text(tmpf, sort_by_word=False)
        with codecs.open(tmpf, 'r', encoding='utf-8') as file:
            serialized_lines = file.readlines()
            self.assertEqual(serialized_lines[0], u"3\n")
            self.assertEqual(len(serialized_lines), 4)
            self.assertEqual(serialized_lines[1][1:], u"\tslovo\t3\n")
            self.assertEqual(serialized_lines[2][1:], u"\tdruhé\t2\n")
            self.assertEqual(serialized_lines[3][1:], u"\tprvé\t1\n")
    def test_from_corpus(self):
        """build `Dictionary` from an existing corpus"""

        documents = ["Human machine interface for lab abc computer applications",
                "A survey of user opinion of computer system response time",
                "The EPS user interface management system",
                "System and human system engineering testing of EPS",
                "Relation of user perceived response time to error measurement",
                "The generation of random binary unordered trees",
                "The intersection graph of paths in trees",
                "Graph minors IV Widths of trees and well quasi ordering",
                "Graph minors A survey"]
        stoplist = set('for a of the and to in'.split())
        texts = [[word for word in document.lower().split() if word not in stoplist]
                for document in documents]

        # remove words that appear only once
        all_tokens = sum(texts, [])
        tokens_once = set(word for word in set(all_tokens) if all_tokens.count(word) == 1)
        texts = [[word for word in text if word not in tokens_once]
                for text in texts]
        dictionary = Dictionary(texts)
        corpus = [dictionary.doc2bow(text) for text in texts]
        dictionary_from_corpus = Dictionary.from_corpus(corpus)

        # we have to compare values, because when creating a dictionary from a corpus
        # the information about the words themselves is lost
        dict_token2id_vals = sorted(dictionary.token2id.values())
        dict_from_corpus_vals = sorted(dictionary_from_corpus.token2id.values())
        self.assertEqual(dict_token2id_vals, dict_from_corpus_vals)
        self.assertEqual(dictionary.dfs, dictionary_from_corpus.dfs)
        self.assertEqual(dictionary.num_docs, dictionary_from_corpus.num_docs)
        self.assertEqual(dictionary.num_pos, dictionary_from_corpus.num_pos)
        self.assertEqual(dictionary.num_nnz, dictionary_from_corpus.num_nnz)
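A minimal follow-up sketch (relying on the documented `id2word` argument of `from_corpus`): passing the original id-to-word mapping preserves the word strings that are otherwise replaced by numeric ids.

texts = [["human", "interface"], ["interface", "computer"]]
original = Dictionary(texts)
corpus = [original.doc2bow(text) for text in texts]
# hand the id->word mapping back in, so tokens are not replaced by stringified ids
restored = Dictionary.from_corpus(corpus, id2word=original)
print(restored.token2id)   # should match original.token2id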
Example #10
File: util.py Project: Badodon/FFNN
def load_data(fname):
    
    print 'input file name:', fname

    target = []  # labels
    source = []  # document vectors

    # build the list of documents
    document_list = []
    word_list = []
    for l in open(fname, 'r').readlines():
        sample = l.strip().split(' ',  1)
        label = sample[0]
        target.append([label])  # label
        word_list = preprocess_string(sample[1])  # remove stopwords and apply stemming
        document_list.append(word_list)  # word list for each document
    
    # build the dictionary
    # drop low- and high-frequency words
    dct = Dictionary(document_list)
    dct.filter_extremes(no_below=3, no_above=0.6)

    # vectorize each document as a bag of words
    for doc in document_list:
        tmp = dct.doc2bow(doc) # ex.[(4, 1), (23,1),..., (119,2)] 
        dense = list(matutils.corpus2dense([tmp], num_terms=len(dct)).T[0])
        source.append(dense)

    dataset = {}
    dataset['target'] = np.array(target)    
    dataset['source'] = np.array(source)    

    return dataset #, max_len, width
    def test_corpus_summarization(self):
        pre_path = os.path.join(os.path.dirname(__file__), 'test_data')

        with utils.smart_open(os.path.join(pre_path, "mihalcea_tarau.txt"), mode="r") as f:
            text = f.read()

        # Generate the corpus.
        sentences = text.split("\n")
        tokens = [sentence.split() for sentence in sentences]
        dictionary = Dictionary(tokens)
        corpus = [dictionary.doc2bow(sentence_tokens) for sentence_tokens in tokens]

        # Extract the most important documents.
        selected_documents = summarize_corpus(corpus)

        # They are compared to the method reference.
        with utils.smart_open(os.path.join(pre_path, "mihalcea_tarau.summ.txt"), mode="r") as f:
            summary = f.read()
            summary = summary.split('\n')

        # Each sentence in the document selection has to be in the model summary.
        for doc_number, document in enumerate(selected_documents):
            # Retrieves all words from the document.
            words = [dictionary[token_id] for (token_id, count) in document]

            # Asserts that all of them are in a sentence from the model reference.
            self.assertTrue(any(all(word in sentence for word in words) for sentence in summary))
def read_project_data(mtc,csc, fname): 
    d1 = Dictionary.load(mtc + ".dict") 
    d2 = Dictionary.load(csc + ".dict")
    #d3 = Dictionary.load('data/postgresql-d4f8dde3-CommitLogCorpus.mallet.dict')
    
    MultiTextCorpus = MalletCorpus(mtc, d1) 
    ChangesetCorpus = MalletCorpus(csc, d2)
    #CommitLogCorpus = MalletCorpus('data/postgresql-d4f8dde3-CommitLogCorpus.mallet', d3)
    
    u1 = set(d1.values())
    u2 = set(d2.values())
    #u3 = set(d3.values())
    
    common = u1.intersection(u2)
    uc_set = (len(u1),len(u2))

    u1_uniq = u1.difference(common)
    u2_uniq = u2.difference(common)
    print(u1_uniq)
    
    fname = "common_words_comparison.txt"
    with open(fname, 'a') as f:
        parts = mtc.split("-")
        f.write(str(parts[0]) + "\n")
        f.write("length of MultiTextCorpus: " + str(len(MultiTextCorpus)) + "\n")
        f.write("length of ChangesetCorpus: " + str(len(ChangesetCorpus)) + "\n" + "\n")
        f.write("(MTC,CSC)  in common" + "\n")
        f.write(str(uc_set) + " " + str(len(common)))
        f.write('\n' + '\n')
class TermFrequency(object):
    """ Computes a term frequency distance_matrix
    """
    def __init__(self, documents):
        logging.log(logging.INFO, "Creating Term Frequency")
        
        self.id2Word = Dictionary(documents)
        self.num_unique_words = len(self.id2Word)
        self.distance_matrix = self.to_term_frequency_matrix(documents)

    def to_term_frequency_vector(self, document):
        return self.id2Word.doc2bow(document)


    def to_binary_vector(self, document):
        tf = self.id2Word.doc2bow(document)
        vect = sparse2full(tf, len(self.id2Word.keys()))
        return np.array(vect > 0, dtype=int)  # converts to binary

    def to_term_frequency_matrix(self, documents):
        return [self.to_term_frequency_vector(d) for d in documents]

    def binary_matrix(self):
        """ Turns a regular tf distance_matrix into a binary distance_matrix """
        def get_binary_data(val):
            if val <= 0:
                return 0
            return 1
       
        full_matrix = MatrixHelper.gensim_to_python_mdarray(self.distance_matrix, self.num_unique_words)
        return [[get_binary_data(cell)
                for cell in row]
                for row in full_matrix]
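A short hedged usage sketch for `TermFrequency`; it assumes `numpy` is imported as `np` and `sparse2full` comes from `gensim.matutils`, as the class itself requires. `MatrixHelper` is only needed by `binary_matrix()`, which is not called here.

docs = [["bank", "river"], ["bank", "money", "money"]]
tf = TermFrequency(docs)
print(tf.to_term_frequency_vector(["bank", "money"]))   # bow pairs, e.g. [(0, 1), (2, 1)]
print(tf.to_binary_vector(["bank", "money"]))           # dense 0/1 numpy array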
def main():
    logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')

    parser = OptionParser()
    parser.add_option('-f', '--corpus-file')
    parser.add_option('-p', '--parse-procs', default=1, type=int)
    parser.add_option('-s', '--sublexicalize-procs', default=1, type=int)
    parser.add_option('-t', '--tfidf-model')
    parser.add_option('-v', '--vocabulary')
    parser.add_option('-m', '--model-file')
    opts, args = parser.parse_args()

    corpus_fn = opts.corpus_file or sys.exit()
    n_proc_parse = opts.parse_procs
    n_proc_sublex = opts.sublexicalize_procs
    vocab_fn = opts.vocabulary
    tfidf_fn = opts.tfidf_model
    model_fn = opts.model_file or sys.exit()

    with BZ2File(corpus_fn) as f:
        corpus = SublexicalizedCorpus(WikiCorpus(corpus_fn, processes=n_proc_parse, dictionary=Dictionary()),
                                      order=(3, 6), clean_func=normalize_whitespace, n_proc=n_proc_sublex,
                                      create_dictionary=False)

        if vocab_fn and os.path.exists(vocab_fn):
            logging.info("Loading vocabulary from %s" % vocab_fn)
            vocab = Dictionary.load(vocab_fn)
        else:
            logging.info("Creating vocabulary")

            start = time.clock()
            vocab = Dictionary(corpus.get_texts())
            end = time.clock()
            logging.info("Vocabulary created in %d seconds" % (end - start))

            if vocab_fn:
                logging.info("Saving dictionary to %s" % vocab_fn)
                vocab.save(vocab_fn)

        corpus.dictionary = vocab

        corpus.dictionary.filter_extremes(no_below=5, no_above=.8)
        corpus.dictionary.compactify()

        if tfidf_fn and os.path.exists(tfidf_fn):
            logging.info("Reading TF-IDF model from %s" % tfidf_fn)
            tfidf = TfidfModel.load(tfidf_fn)
        else:
            logging.info("creating TF-IDF model")
            tfidf = TfidfModel(corpus)

            if tfidf_fn:
                logging.info("Saving TFF-IDF model to %s" % tfidf_fn)
                tfidf.save(tfidf_fn)

        bow_corpus = (tfidf[art] for art in corpus)

        model = LsiModel(corpus=bow_corpus, num_topics=10, id2word=corpus.dictionary)

        model.save(model_fn)
def main():
    collection_name = "nips"
    years = xrange(2010, 2015)  # 10 ~ 14
    n_topics = 10
    
    corpus_paths = map(lambda y: 
                       "data/{}-{}.dat".format(collection_name, y),
                       years)
    all_corpus = []
    year2corpus = {}
    for year, path in zip(years, corpus_paths):
        corpus = list(load_line_corpus(path))
        all_corpus.append(proc_corpus(corpus))
        year2corpus[year] = corpus

    all_corpus = list(itertools.chain.from_iterable(all_corpus))

    dictionary = Dictionary(all_corpus)
    all_corpus = [dictionary.doc2bow(doc)
                  for doc in all_corpus]

    import pdb
    pdb.set_trace()

    # print all_corpus
    model = LdaModel(all_corpus, num_topics=n_topics,
                     id2word=dictionary,
                     eval_every=10, passes=100)
    print model.show_topics()
def evaluate_log(context, config):
    logger.info('Evaluating models for: %s' % config.project.name)

    model_fname = config.model_fname % ChangesetCorpus.__name__
    changeset_fname = config.corpus_fname % ChangesetCorpus.__name__
    commit_fname = config.corpus_fname % CommitLogCorpus.__name__

    try:
        commit_id2word = Dictionary.load(commit_fname + '.dict')
        commit_corpus = MalletCorpus(commit_fname,
                                     id2word=commit_id2word)
        changeset_id2word = Dictionary.load(changeset_fname + '.dict')
        changeset_corpus = MalletCorpus(changeset_fname,
                                        id2word=changeset_id2word)
    except:
        error('Corpora not built yet -- cannot evaluate')

    try:
        model = LdaModel.load(model_fname)
        logger.info('Opened previously created model at file %s' % model_fname)
    except:
        error('Cannot evaluate: LDA models have not been built yet!')

    changeset_doc_topic = get_doc_topic(changeset_corpus, model)
    commit_doc_topic = get_doc_topic(commit_corpus, model)

    first_shared = dict()
    for id_ in commit_doc_topic:
        i = 0
        commit_topics = [topic[0] for topic in commit_doc_topic[id_]]
        try:
            changeset_topics = [topic[0] for topic in changeset_doc_topic[id_]]
        except:
            continue

        maximum = 101
        minimum = maximum

        for i, topic in enumerate(commit_topics):
            if topic in changeset_topics:
                j = changeset_topics.index(topic)
                minimum = min(minimum, max(i, j))

        for i, topic in enumerate(changeset_topics):
            if topic in commit_topics:
                j = commit_topics.index(topic)
                minimum = min(minimum, max(i, j))

        first_shared[id_] = minimum

        if minimum == maximum:
            logger.info('No common topics found for %s' % str(id_))
            del first_shared[id_]

    mean = sum(first_shared.values()) / len(first_shared)

    with open('data/evaluate-log-results.csv', 'a') as f:
        w = csv.writer(f)
        w.writerow([model_fname, mean] + list(first_shared.values()))
Example #17
 def load_dictionary(self, filepath):
     dictionary = Dictionary() 
     with open(filepath, "rb") as f: 
         for line in f.readlines():
             # example = SampleTrainingExample(line)
             # context = example.context
             dictionary.add_documents([[word.lower() for word in line.split()]])
     return dictionary
def bag_of_words(lemma):
    "Takes in lemmatised words and returns a bow."
    # Create bag of words from dictionnary
    dictionary = Dictionary(lemma)
    dictionary.save('text.dict')
    # Term frequency–inverse document frequency (TF-IDF)
    bow = [dictionary.doc2bow(l) for l in lemma] # Calculates inverse document counts for all terms
    return (bow, dictionary)
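The comments above mention TF-IDF, but `bag_of_words` itself only produces raw counts; a hedged sketch of the follow-on step with gensim's `TfidfModel`:

from gensim.models import TfidfModel

bow, dictionary = bag_of_words([["cat", "dog"], ["dog", "bird"]])
tfidf = TfidfModel(bow)                  # learns IDF weights from the bow corpus
weighted = [tfidf[doc] for doc in bow]   # per-document TF-IDF vectors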
Example #19
def save_dictionary(
    dic: corpora.Dictionary,
    filename: str
) -> None:
    dic.save(filename)
    print("saved dictionary: {} items to {}".format(
        len(dic.values()), filename
    ))
Example #20
 def test_run(self, data):
     dictionary = Dictionary(data)
     dictionary.filter_extremes(no_above=0.5)
     bags_of_words = [ dictionary.doc2bow(t) for t in data]
     #This can take a while to run:
     lda = LdaModel(bags_of_words, id2word = dictionary, num_topics=30, passes=2)
     results = self.assemble_topics(lda)
     return results
Example #21
def to_corpus(documents):
    """
    Make into a corpus
    @documents:list[list[tuple[str,int]]] of bows
    @returns Dictionary, Corpus
    """
    d = Dictionary()
    corpus = [d.doc2bow(doc, allow_update=True) for doc in documents]
    return d, corpus
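A tiny usage sketch with toy token lists, showing how `allow_update=True` grows the dictionary while the bows are built:

d, corpus = to_corpus([["bank", "river"], ["bank", "money"]])
print(d.token2id)   # e.g. {'bank': 0, 'river': 1, 'money': 2}
print(corpus)       # [[(0, 1), (1, 1)], [(0, 1), (2, 1)]]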
Example #22
File: G8.py Project: lum4chi/IR
def do_ir2(db, param):
    print 'Computing IR2', db, param, '...'

    def words(text):
        stopwords = set(nltk.corpus.stopwords.words('english'))
        return [w for w in nltk.word_tokenize(text.lower()) if w not in string.punctuation and w not in stopwords]

    class BigramsCorpus:
        def __init__(self, db, collection):
            self.client = MongoClient()[db][collection]

        def __iter__(self):
            for doc in self.client.find():
                yield [doc['_id']]

        def __len__(self):
            return self.client.count()

    bigram_corpus = BigramsCorpus('cordis', 'bi_grams')
    bigrams = Dictionary(bigram_corpus)

    project ={'$project': {'_id': 0, 'title': 1, 'reference': 1}}
    a = [project]
    project_corpus = MongoCorpus('cordis', 'projects', aggregate=a)

    n = max(bigrams.keys())
    dataset = []

    for doc in project_corpus:
        temp = bigrams.doc2bow([' '.join(x) for x in nltk.bigrams(words(doc['title']))])
        x = [0]*(n+1)
        for bi, _ in temp:
            x[bi] = 1
        dataset.append(x)

    alg = KMeans(n_clusters=int(param))
    alg.fit(dataset)

    clusters = defaultdict(list)
    for i, doc in enumerate(project_corpus):
        temp = bigrams.doc2bow([' '.join(x) for x in nltk.bigrams(words(doc['title']))])
        x = [0]*(n+1)
        for bi, _ in temp:
            x[bi] = 1
        p = alg.predict([x])
        clusters[p[0]].append(doc['reference'])

    mongo_clusters = []
    for k, v in clusters.items():
        mongo_clusters.append({'cluster': k, 'projects': v})

    # Mongo raises this error: InvalidDocument: Cannot encode object: 0
    print mongo_clusters
    # Save to the Mongo collection
    mongo = MongoClient()['g8']['ir2']
    mongo.insert_many(mongo_clusters)
    print 'Done!'
Example #23
 def run(self, data):
     wordlists = [corpus.tokenized_contents for corpus in data]
     dictionary = Dictionary(wordlists)
     # dictionary.filter_extremes(no_above=0.5)
     bags_of_words = [ dictionary.doc2bow(t) for t in wordlists]
     #This can take a while to run:
     lda = LdaModel(bags_of_words, id2word = dictionary, num_topics=30, passes=10)
     results = []
     return self.assemble_topics(lda)
Example #24
    def test_low_distinct_words_corpus_summarization_is_empty_list(self):
        text = self._get_text_from_test_data("testlowdistinctwords.txt")

        # Generate the corpus.
        sentences = text.split("\n")
        tokens = [sentence.split() for sentence in sentences]
        dictionary = Dictionary(tokens)
        corpus = [dictionary.doc2bow(sentence_tokens) for sentence_tokens in tokens]

        self.assertEqual(summarize_corpus(corpus), [])
    def test_saveAsText_and_loadFromText(self):
        """`Dictionary` can be saved as textfile and loaded again from textfile. """
        tmpf = get_tmpfile('dict_test.txt')
        for sort_by_word in [True, False]:
            d = Dictionary(self.texts)
            d.save_as_text(tmpf, sort_by_word=sort_by_word)
            self.assertTrue(os.path.exists(tmpf))

            d_loaded = Dictionary.load_from_text(tmpf)
            self.assertNotEqual(d_loaded, None)
            self.assertEqual(d_loaded.token2id, d.token2id)
Example #26
 def setUp(self):
     texts = [[u'senior', u'studios', u'studios', u'studios', u'creators', u'award', u'mobile', u'currently', u'challenges', u'senior', u'summary', u'senior', u'motivated', u'creative', u'senior'],[u'performs', u'engineering', u'tasks', u'infrastructure', u'focusing', u'primarily', u'programming', u'interaction', u'designers', u'engineers', u'leadership', u'teams', u'teams', u'crews', u'responsibilities', u'engineering', u'quality', u'functional', u'functional', u'teams', u'organizing', u'prioritizing', u'technical', u'decisions', u'engineering', u'participates', u'participates', u'reviews', u'participates', u'hiring', u'conducting', u'interviews'],[u'feedback', u'departments', u'define', u'focusing', u'engineering', u'teams', u'crews', u'facilitate', u'engineering', u'departments', u'deadlines', u'milestones', u'typically', u'spends', u'designing', u'developing', u'updating', u'bugs', u'mentoring', u'engineers', u'define', u'schedules', u'milestones', u'participating'],[ u'reviews', u'interviews', u'sized', u'teams', u'interacts', u'disciplines', u'knowledge', u'skills', u'knowledge', u'knowledge', u'xcode', u'scripting', u'debugging', u'skills', u'skills', u'knowledge', u'disciplines', u'animation', u'networking', u'expertise', u'competencies', u'oral', u'skills', u'management', u'skills', u'proven', u'effectively', u'teams', u'deadline', u'environment', u'bachelor', u'minimum', u'shipped', u'leadership', u'teams', u'location', u'resumes', u'jobs', u'candidates', u'openings', u'jobs'], 
     [u'maryland', u'client', u'producers', u'electricity', u'operates', u'storage', u'utility', u'retail', u'customers', u'engineering', u'consultant', u'maryland', u'summary', u'technical', u'technology', u'departments', u'expertise', u'maximizing', u'output', u'reduces', u'operating', u'participates', u'areas', u'engineering', u'conducts', u'testing', u'solve', u'supports', u'environmental', u'understands', u'objectives', u'operates', u'responsibilities', u'handles', u'complex', u'engineering', u'aspects', u'monitors', u'quality', u'proficiency', u'optimization', u'recommendations', u'supports', u'personnel', u'troubleshooting', u'commissioning', u'startup', u'shutdown', u'supports', u'procedure', u'operating', u'units', u'develops', u'simulations', u'troubleshooting', u'tests', u'enhancing', u'solving', u'develops', u'estimates', u'schedules', u'scopes', u'understands', u'technical', u'management', u'utilize', u'routine', u'conducts', u'hazards', u'utilizing', u'hazard', u'operability', u'methodologies', u'participates', u'startup', u'reviews', u'pssr', u'participate', u'teams', u'participate', u'regulatory', u'audits', u'define', u'scopes', u'budgets', u'schedules', u'technical', u'management', u'environmental', u'awareness', u'interfacing', u'personnel', u'interacts', u'regulatory', u'departments', u'input', u'objectives', u'identifying', u'introducing', u'concepts', u'solutions', u'peers', u'customers', u'coworkers', u'knowledge', u'skills', u'engineering', u'quality', u'engineering'], [u'commissioning', u'startup', u'knowledge', u'simulators', u'technologies', u'knowledge', u'engineering', u'techniques', u'disciplines', u'leadership', u'skills', u'proven', u'engineers', u'oral', u'skills', u'technical', u'skills', u'analytically', u'solve', u'complex', u'interpret', u'proficiency', u'simulation', u'knowledge', u'applications', u'manipulate', u'applications', u'engineering'],[u'calculations', u'programs', u'matlab', u'excel', u'independently', u'environment', u'proven', u'skills', u'effectively', u'multiple', u'tasks', u'planning', u'organizational', u'management', u'skills', u'rigzone', u'jobs', u'developer', u'exceptional', u'strategies', u'junction', u'exceptional', u'strategies', u'solutions', u'solutions', u'biggest', u'insurers', u'operates', u'investment'], [u'vegas', u'tasks', u'electrical', u'contracting', u'expertise', u'virtually', u'electrical', u'developments', u'institutional', u'utilities', u'technical', u'experts', u'relationships', u'credibility', u'contractors', u'utility', u'customers', u'customer', u'relationships', u'consistently', u'innovations', u'profile', u'construct', u'envision', u'dynamic', u'complex', u'electrical', u'management', u'grad', u'internship', u'electrical', u'engineering', u'infrastructures', u'engineers', u'documented', u'management', u'engineering', u'quality', u'engineering', u'electrical', u'engineers', u'complex', u'distribution', u'grounding', u'estimation', u'testing', u'procedures', u'voltage', u'engineering'],[u'troubleshooting', u'installation', u'documentation', u'bsee', u'certification', u'electrical', u'voltage', u'cabling', u'electrical', u'engineering', u'candidates', u'electrical', u'internships', u'oral', u'skills', u'organizational', u'prioritization', u'skills', u'skills', u'excel', u'cadd', u'calculation', u'autocad', u'mathcad', u'skills', u'skills', u'customer', u'relationships', u'solving', u'ethic', u'motivation', u'tasks', u'budget', u'affirmative', u'diversity', u'workforce', u'gender', u'orientation', u'disability', 
u'disabled', u'veteran', u'vietnam', u'veteran', u'qualifying', u'veteran', u'diverse', u'candidates', u'respond', u'developing', u'workplace', u'reflects', u'diversity', u'communities', u'reviews', u'electrical', u'contracting', u'southwest', u'electrical', u'contractors'], [u'intern', u'electrical', u'engineering', u'idexx', u'laboratories', u'validating', u'idexx', u'integrated', u'hardware', u'entails', u'planning', u'debug', u'validation', u'engineers', u'validation', u'methodologies', u'healthcare', u'platforms', u'brightest', u'solve', u'challenges', u'innovation', u'technology', u'idexx', u'intern', u'idexx', u'interns', u'supplement', u'interns', u'teams', u'roles', u'competitive', u'interns', u'idexx', u'interns', u'participate', u'internships', u'mentors', u'seminars', u'topics', u'leadership', u'workshops', u'relevant', u'planning', u'topics', u'intern', u'presentations', u'mixers', u'applicants', u'ineligible', u'laboratory', u'compliant', u'idexx', u'laboratories', u'healthcare', u'innovation', u'practicing', u'veterinarians', u'diagnostic', u'technology', u'idexx', u'enhance', u'veterinarians', u'efficiency', u'economically', u'idexx', u'worldwide', u'diagnostic', u'tests', u'tests', u'quality', u'headquartered', u'idexx', u'laboratories', u'employs', u'customers', u'qualifications', u'applicants', u'idexx', u'interns', u'potential', u'demonstrated', u'portfolio', u'recommendation', u'resumes', u'marketing', u'location', u'americas', u'verification', u'validation', u'schedule', u'overtime', u'idexx', u'laboratories', u'reviews', u'idexx', u'laboratories', u'nasdaq', u'healthcare', u'innovation', u'practicing', u'veterinarians'], [u'location', u'duration', u'temp', u'verification', u'validation', u'tester', u'verification', u'validation', u'middleware', u'specifically', u'testing', u'applications', u'clinical', u'laboratory', u'regulated', u'environment', u'responsibilities', u'complex', u'hardware', u'testing', u'clinical', u'analyzers', u'laboratory', u'graphical', u'interfaces', u'complex', u'sample', u'sequencing', u'protocols', u'developers', u'correction', u'tracking', u'tool', u'timely', u'troubleshoot', u'testing', u'functional', u'manual', u'automated', u'participate', u'ongoing'],[u'testing', u'coverage', u'planning', u'documentation', u'testing', u'validation', u'corrections', u'monitor', u'implementation', u'recurrence', u'operating', u'statistical', u'quality', u'testing', u'global', u'multi', u'teams', u'travel', u'skills', u'concepts', u'waterfall', u'agile', u'methodologies', u'debugging', u'skills', u'complex', u'automated', u'instrumentation', u'environment', u'hardware', u'mechanical', u'components', u'tracking', u'lifecycle', u'management', u'quality', u'organize', u'define', u'priorities', u'organize', u'supervision', u'aggressive', u'deadlines', u'ambiguity', u'analyze', u'complex', u'situations', u'concepts', u'technologies', u'verbal', u'skills', u'effectively', u'technical', u'clinical', u'diverse', u'strategy', u'clinical', u'chemistry', u'analyzer', u'laboratory', u'middleware', u'basic', u'automated', u'testing', u'biomedical', u'engineering', u'technologists', u'laboratory', u'technology', u'availability', u'click', u'attach'], [u'scientist', u'linux', u'asrc', u'scientist', u'linux', u'asrc', u'technology', u'solutions', u'subsidiary', u'asrc', u'engineering', u'technology', u'contracts'], [u'multiple', u'agencies', u'scientists', u'engineers', u'management', u'personnel', u'allows', u'solutions', u'complex', u'aeronautics', u'aviation', 
u'management', u'aviation', u'engineering', u'hughes', u'technical', u'technical', u'aviation', u'evaluation', u'engineering', u'management', u'technical', u'terminal', u'surveillance', u'programs', u'currently', u'scientist', u'travel', u'responsibilities', u'develops', u'technology', u'modifies', u'technical', u'complex', u'reviews', u'draft', u'conformity', u'completeness', u'testing', u'interface', u'hardware', u'regression', u'impact', u'reliability', u'maintainability', u'factors', u'standardization', u'skills', u'travel', u'programming', u'linux', u'environment', u'cisco', u'knowledge', u'terminal', u'environment', u'clearance', u'clearance', u'input', u'output', u'digital', u'automatic', u'terminal', u'management', u'controller', u'termination', u'testing', u'evaluating', u'policies', u'procedure', u'interface', u'installation', u'verification', u'certification', u'core', u'avionic', u'programs', u'knowledge', u'procedural', u'testing', u'interfacing', u'hardware', u'regression', u'impact', u'reliability', u'maintainability', u'factors', u'standardization', u'missions', u'asrc', u'subsidiaries', u'affirmative', u'employers', u'applicants', u'disability', u'veteran', u'technology', u'location', u'airport', u'bachelor', u'schedule', u'travel', u'contributor', u'management', u'asrc', u'reviews'], [u'technical', u'solarcity', u'niche', u'vegas', u'overview', u'resolving', u'customer', u'clients', u'expanding', u'engineers', u'developers', u'responsibilities', u'knowledge', u'planning', u'adapt', u'dynamic', u'environment', u'inventive', u'creative', u'solarcity', u'lifecycle', u'responsibilities', u'technical', u'analyzing', u'diagnosing', u'troubleshooting', u'customers', u'ticketing', u'console', u'escalate', u'knowledge', u'engineering', u'timely', u'basic', u'phone', u'functionality', u'customer', u'tracking', u'knowledgebase', u'rotation', u'configure', u'deployment', u'sccm', u'technical', u'deployment', u'deploy', u'hardware', u'solarcity', u'bachelor', u'knowledge', u'dell', u'laptops', u'analytical', u'troubleshooting', u'solving', u'skills', u'knowledge', u'databases', u'preferably', u'server', u'preferably', u'monitoring', u'suites', u'documentation', u'procedures', u'knowledge', u'entries', u'verbal', u'skills', u'customer', u'skills', u'competitive', u'solar', u'package', u'insurance', u'vacation', u'savings', u'referral', u'eligibility', u'equity', u'performers', u'solarcity', u'affirmative', u'diversity', u'workplace', u'applicants', u'orientation', u'disability', u'veteran', u'careerrookie'], [u'embedded', u'exelis', u'junction', u'exelis', u'embedded', u'acquisition', u'networking', u'capabilities', u'classified', u'customer', u'motivated', u'develops', u'tests', u'innovative', u'solutions', u'minimal', u'supervision', u'paced', u'environment', u'enjoys', u'assignments', u'interact', u'multi', u'disciplined', u'challenging', u'focused', u'embedded', u'developments', u'spanning', u'engineering', u'lifecycle', u'specification', u'enhancement', u'applications', u'embedded', u'freescale', u'applications', u'android', u'platforms', u'interface', u'customers', u'developers', u'refine', u'specifications', u'architectures'],[u'java', u'programming', u'scripts', u'python', u'debug', u'debugging', u'emulators', u'regression', u'revisions', u'specialized', u'setups', u'capabilities', u'subversion', u'technical', u'documentation', u'multiple', u'engineering', u'techexpousa', u'reviews'], [u'modeler', u'semantic', u'modeling', u'models', u'skills', u'ontology', u'resource', 
u'framework', u'schema', u'technologies', u'hadoop', u'warehouse', u'oracle', u'relational', u'artifacts', u'models', u'dictionaries', u'models', u'interface', u'specifications', u'documentation', u'harmonization', u'mappings', u'aligned', u'coordinate', u'technical', u'peer', u'reviews', u'stakeholder', u'communities', u'impact', u'domains', u'relationships', u'interdependencies', u'models', u'define', u'analyze', u'legacy', u'models', u'corporate', u'databases', u'architectural', u'alignment', u'customer', u'expertise', u'harmonization', u'modeling', u'modeling', u'consulting', u'stakeholders', u'quality', u'models', u'storage', u'agile', u'specifically', u'focus', u'modeling', u'qualifications', u'bachelors', u'accredited', u'modeler', u'encompass', u'evaluation', u'skills', u'knowledge', u'modeling', u'techniques', u'resource', u'framework', u'schema', u'technologies', u'unified', u'modeling', u'technologies', u'schemas', u'ontologies', u'sybase', u'knowledge', u'skills', u'interpersonal', u'skills', u'customers', u'clearance', u'applicants', u'eligibility', u'classified', u'clearance', u'polygraph', u'techexpousa', u'solutions', u'partnership', u'solutions', u'integration'], [u'technologies', u'junction', u'develops', u'maintains', u'enhances', u'complex', u'diverse', u'intensive', u'analytics', u'algorithm', u'manipulation', u'management', u'documented', u'individually', u'reviews', u'tests', u'components', u'adherence', u'resolves', u'utilizes', u'methodologies', u'environment', u'input', u'components', u'hardware', u'offs', u'reuse', u'cots', u'gots', u'synthesis', u'components', u'tasks', u'individually', u'analyzes', u'modifies', u'debugs', u'corrects', u'integrates', u'operating', u'environments', u'develops', u'queries', u'databases', u'repositories', u'recommendations', u'improving', u'documentation', u'develops', u'implements', u'algorithms', u'functional', u'assists', u'developing', u'executing', u'procedures', u'components', u'reviews', u'documentation', u'solutions', u'analyzing', u'conferring', u'users', u'engineers', u'analyzing', u'investigating', u'areas', u'adapt', u'hardware', u'mathematical', u'models', u'predict', u'outcome', u'implement', u'complex', u'database', u'repository', u'interfaces', u'queries', u'bachelors', u'accredited', u'substituted', u'bachelors', u'firewalls', u'ipsec', u'vpns', u'technology', u'administering', u'servers', u'apache', u'jboss', u'tomcat', u'developing', u'interfaces', u'firefox', u'internet', u'explorer', u'operating', u'mainframe', u'linux', u'solaris', u'virtual', u'scripting', u'programming', u'oriented', u'programming', u'ajax', u'script', u'procedures', u'cobol', u'cognos', u'fusion', u'focus', u'html', u'java', u'java', u'script', u'jquery', u'perl', u'visual', u'basic', u'powershell', u'cots', u'cots', u'oracle', u'apex', u'integration', u'competitive', u'package', u'bonus', u'corporate', u'equity', u'tuition', u'reimbursement', u'referral', u'bonus', u'holidays', u'insurance', u'flexible', u'disability', u'insurance'], [u'technologies', u'disability', u'accommodation', u'recruiter', u'techexpousa'],
     ['bank','river','shore','water'],['river','water','flow','fast','tree'],['bank','water','fall','flow'],['bank','bank','water','rain','river'],
     ['river','water','mud','tree'],['money','transaction','bank','finance'],
     ['bank','borrow','money'], ['bank','finance'], ['finance','money','sell','bank'],['borrow','sell'],['bank','loan','sell']]
     # initializing using own LDA sufficient statistics so that we get same results each time.
     sstats = numpy.loadtxt(datapath('sstats_test.txt'))
     dictionary = Dictionary(texts)
     corpus = [dictionary.doc2bow(text) for text in texts]
     self.ldaseq = ldaseqmodel.LdaSeqModel(corpus=corpus, id2word=dictionary, num_topics=2, time_slice=[10, 10, 11], initialize='own', sstats=sstats)
Example #27
    def create_dict(self, corpus_file):
        dictionary = Dictionary()
        with open(corpus_file, "r") as infile:
            lines = infile.readlines()  # read all lines from the file

            for line in lines:
                doc = line.split()  # the doc as a bag of tokens from this line
                dictionary.add_documents([doc])

        return dictionary
Example #28
    def test_saveAsText_and_loadFromText(self):
        """ `Dictionary` can be saved as textfile and loaded again from textfile. """
        tmpf = get_tmpfile('dict_test.txt')
        d = Dictionary(self.texts)
        d.save_as_text(tmpf)
        # does the file exist?
        self.assertTrue(os.path.exists(tmpf))

        d_loaded = Dictionary.load_from_text(get_tmpfile('dict_test.txt'))
        self.assertNotEqual(d_loaded, None)
        self.assertEqual(d_loaded.token2id, d.token2id)
    def test_doc2bow(self):
        d = Dictionary([["žluťoučký"], ["žluťoučký"]])

        # pass a utf8 string
        self.assertEqual(d.doc2bow(["žluťoučký"]), [(0, 1)])

        # doc2bow must raise a TypeError if passed a string instead of array of strings by accident
        self.assertRaises(TypeError, d.doc2bow, "žluťoučký")

        # unicode must be converted to utf8
        self.assertEqual(d.doc2bow([u'\u017elu\u0165ou\u010dk\xfd']), [(0, 1)])
    def testMallet2ModelOn20NewsGroups(self):
        corpus = [simple_preprocess(doc["data"]) for doc in api.load("20-newsgroups")]
        dictionary = Dictionary(corpus)

        corpus = [dictionary.doc2bow(text) for text in corpus]

        lda_mallet_model = ldamallet.LdaMallet(
            self.mallet_path, corpus=corpus,
            num_topics=20, id2word=dictionary, iterations=500)

        lda_gensim_model = ldamallet.malletmodel2ldamodel(lda_mallet_model, iterations=1000)
        self.assertEqual(lda_mallet_model.show_topics(20, 50), lda_gensim_model.show_topics(20, 50))
Example #31
 
 LTP_DATA_DIR = 'E:/Program Files/workspace/ltp_data_v3.4.0'  # path to the LTP model directory
 cws_model_path = os.path.join(LTP_DATA_DIR, 'cws.model')  # word segmentation model path; the model file is `cws.model`
 pos_model_path = os.path.join(LTP_DATA_DIR, 'pos.model')  # POS tagging model path; the model file is `pos.model`
 segmentor = Segmentor()  # initialize the segmenter
 segmentor.load_with_lexicon(cws_model_path, LTP_DATA_DIR+'/user_dict.txt')  # load the model; the second argument is the path to your external lexicon file
 postagger = Postagger()  # initialize the POS tagger
 postagger.load(pos_model_path)  # load the model
 
 combain_comtent = []
 for file in file_list:
     combain_comtent.append(get_content(file))
     
 segmentor.release()  # release the model
 
 dictionary = Dictionary(combain_comtent)
 corpus = [ dictionary.doc2bow(text) for text in combain_comtent]
 lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=31)
 
 # save the dictionary
 dictionary.save_as_text(write_path+"dictionary.txt")
 # save the LDA model
 lda.save(write_path+"model")
 
 for file in lda.print_topics(31):
     print(file[0])
 
 
 topic_list = []
 for i in lda.get_document_topics(corpus):
     listj=[]
Example #32
    def out_corp_dic(self, text):

        dictionary = Dictionary(text)
        corpus = [dictionary.doc2bow(doc) for doc in text]

        return {"dictionary": dictionary, "corpus": corpus}
Example #33
print(string_similar('安定区妇幼保健站', '定西市安定区妇幼保健站'))
print(string_similar('柬埔寨特大新闻', '柬埔寨新闻'))

from jieba import lcut
from gensim.similarities import SparseMatrixSimilarity
from gensim.corpora import Dictionary
from gensim.models import TfidfModel
# corpus of texts and the search keyword
texts = [
    '吃鸡这里所谓的吃鸡并不是真的吃鸡,也不是谐音词刺激的意思', '而是出自策略射击游戏《绝地求生:大逃杀》里的台词', '我吃鸡翅,你吃鸡腿'
]
keyword = '玩过吃鸡?今晚一起吃鸡'
# 1. Tokenize each text in the corpus
texts = [lcut(text) for text in texts]
# 2. Build a dictionary from the corpus and get the number of dictionary features
dictionary = Dictionary(texts)
num_features = len(dictionary.token2id)
# 3.1 Using the dictionary, convert the token lists into sparse vectors, forming the corpus
corpus = [dictionary.doc2bow(text) for text in texts]
# 3.2 Likewise, use the dictionary to convert the search keyword into a sparse vector
kw_vector = dictionary.doc2bow(lcut(keyword))
# 4. Build a TF-IDF model, training it on the corpus
tfidf = TfidfModel(corpus)
# 5. Apply the trained TF-IDF model to the indexed texts and the search keyword
tf_texts = tfidf[corpus]  # here the corpus doubles as the set of indexed texts
tf_kw = tfidf[kw_vector]
# 6. Compute the similarities
sparse_matrix = SparseMatrixSimilarity(tf_texts, num_features)
similarities = sparse_matrix.get_similarities(tf_kw)
for e, s in enumerate(similarities, 1):
    print('similarity between kw and text%d: %.2f' % (e, s))
Example #34
assuming you want to use a local copy of the corpus, just read:
/tutorials/_0/pr_wsj_ft.json
'''
'''
assuming the corpus of text is stored in a Mongo DB
'''
## --+ open pipeline
#client = MongoClient()
## --+ pick-up db
#db = client.digitalTechs
## --+ load the data
#df = pd.DataFrame(list(db.press_releases.find()))
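# --+ hedged sketch: read the local JSON copy mentioned above instead of Mongo
# --+ (assumes pandas is imported as pd elsewhere in this tutorial)
df = pd.read_json('/tutorials/_0/pr_wsj_ft.json')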

# dictionary (saved in tutorial _0)
in_f = os.path.join('transformation', '.data', 'pr_dictionary.dict')
dictionary = Dictionary.load(in_f)

# corpus (saved in tutorial _0)
in_f = os.path.join('transformation', '.data', 'pr_corpus.mm')
corpus = MmCorpus(in_f)

# docs phrased (saved in tutorial _0)
in_f = os.path.join('transformation', '.data', 'pr_docs_phrased.pickle')
with open(in_f, 'rb') as pipe:
    docs_phrased = pickle.load(pipe)

# %% clean data with document attributes

# basic cleaning
# --+ get timespans
df.loc[:, 'year'] = df['date'].dt.year
Example #35
 def _build_dictionary(documents: List[List[str]],
                       filter_parameters: Dict[Any, Any]) -> Dictionary:
     d = Dictionary(documents)
     d.filter_extremes(**filter_parameters)
     return d
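A brief hedged sketch of calling the helper above as if it were a module-level function; the filter values are illustrative and follow gensim's `filter_extremes` semantics (`no_below` is an absolute document count, `no_above` a fraction of documents).

vocab = _build_dictionary(
    documents=[["bank", "river"], ["bank", "money"], ["bank", "loan"]],
    filter_parameters={"no_below": 1, "no_above": 0.9},
)
print(len(vocab))   # 'bank' is dropped because it appears in every document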
def preprocess_dataset(dataset: pd.DataFrame, extreme_no_below: int,
                       extreme_no_above: float, enable_bigram: bool,
                       min_bigram_count: int, basic_word_analysis: bool,
                       lemmatizing: bool, temporality: str, language: str,
                       path_to_texts_for_embedding: str,
                       split_by_paragraph: bool) -> Dict[str, Any]:
    """Node for preprocessing the UN General Debates dataset.
        Parameters are taken from conf/base/parameters.yml.
        The data and the parameters will be loaded and provided to this function
        automatically when the pipeline is executed and it is time to run this node.

        Args:
            dataset: Source data. Must have a column named "text" to be processed. Dataset must be in catalog.yml
        Returns:
            Preprocessed dataset,
            vocabulary size,
            dictionary,
            date range
        Parameters :
            extreme_no_below : if > 1: for a word w, delete w from the vocabulary if it appears in fewer than extreme_no_below documents; if in [0,1]: delete w if it appears in fewer than extreme_no_below*100% of documents
            extreme_no_above : in [0,1]; for a word w, delete w from the vocabulary if it appears in more than extreme_no_above*100% of documents
            enable_bigram : Boolean, decide if you want bigrams or not in the dictionary
            min_bigram_count : Int, threshold for bigrams: a bigram is added to the dictionary if it appears in more than min_bigram_count documents
            basic_word_analysis : Boolean, set to True if you want to print some basic word analysis (essentially the number of words removed at each preprocessing step)
            lemmatizing : Boolean, set to True if lemmatizing is wanted
            temporality : 'year', 'month' or 'week' according to desired time slices
            language : source language for the corpus
            path_to_texts_for_embedding : txt file containing materials for fasttext training
            split_by_paragraph : boolean set to True if documents need to be split by paragraphs
        """
    t0 = time()

    print('\n\nCurrent set of parameters :\n')
    print('\textreme_no_below : {}'.format(extreme_no_below))
    print('\textreme_no_above : {}'.format(extreme_no_above))
    print('\tenable_bigram : {}'.format(enable_bigram))
    print('\tmin_bigram_count : {}'.format(min_bigram_count))
    print('\tlemmatizing : {}'.format(lemmatizing))
    print('\ttemporality : {}'.format(temporality))
    print('\tlanguage : {}\n'.format(language))
    print('\nStart preprocessing of dataset')

    if "text" not in dataset.columns:
        raise ValueError(
            'Dataset does not have a column named "text". You must rename the your text column to "text".'
        )
    if "timestamp" not in dataset.columns:
        raise ValueError(
            'Dataset does not have a column named "timestamp". You must rename your time column to "timestamp".'
        )

    if split_by_paragraph:
        print('\nSplitting by paragraphs...')
        # NOTE: the boolean flag shadows the split_by_paragraph helper of the same name,
        # so this call only resolves correctly if that helper is bound under another name.
        dataset['text'], dataset['timestamp'] = split_by_paragraph(
            dataset['text'].values, dataset['timestamp'].values)

    dataset['raw_index'] = dataset.index.values
    init_n_obs = dataset.shape[0]
    print('Starting number of observations : {}'.format(init_n_obs))

    # Drop rows with missing values
    dataset.dropna(subset=['text', 'timestamp'], inplace=True)
    no_na_n_obs = dataset.shape[0]
    print(
        'Number of observations after deleting missing values : {} = {} missing values'
        .format(no_na_n_obs, init_n_obs - no_na_n_obs))

    #Dropping errors on date
    dataset = handle_errors(dataset, no_na_n_obs)
    final_n_obs = dataset.shape[0]
    print(
        'Final number of observations after handling errors on date : {} = {} errors on date'
        .format(final_n_obs, no_na_n_obs - final_n_obs))
    print('Deleted a total of {} observations.'.format(init_n_obs -
                                                       final_n_obs))

    dataset['timestamp'] = date_conversion(dataset)

    dataset.sort_values('timestamp', inplace=True)
    dataset.reset_index(drop=True, inplace=True)
    dataset['index'] = dataset.index.values

    docs = dataset['text']

    docs = docs.str.lower()
    docs = docs.apply(lambda x: unidecode.unidecode(x))

    print('\nTokenizing...')
    docs = tokenize(docs)

    if basic_word_analysis:
        print(
            '\nBasic word analysis enabled. It will take more time to compute...\n'
        )

        if enable_bigram:
            print('\nAdding bigrams...')
            before_vocab = len(Dictionary(docs))
            docs = add_bigram(docs, min_bigram_count)
            bigram_vocab = len(Dictionary(docs))
            print('\nFound {} bigrams in text\n'.format(bigram_vocab -
                                                        before_vocab))

        len_starting_vocab = len(Dictionary(docs))
        print('\nBeginning dictionary contains : {} words\n'.format(
            len_starting_vocab))

        print('\nRemoving stopwords...')
        docs = remove_stop_words(docs, language)
        curr_len_vocab = len(Dictionary(docs))
        len_rm_words = len_starting_vocab - curr_len_vocab
        len_vocab = curr_len_vocab
        freq = round(len_rm_words / len_starting_vocab, 3) * 100
        print(
            '\tRemoved {} stopwords from dictionary. It represents {}% of total words in starting vocabulary'
            .format(len_rm_words, freq))
        print('\tCurrent length of the vocabulary:', len_vocab)

        print('\nRemoving unique numbers (not words that contain numbers)...')
        docs = remove_numbers(docs)
        curr_len_vocab = len(Dictionary(docs))
        len_rm_words = len_vocab - curr_len_vocab
        len_vocab = curr_len_vocab
        freq = round(len_rm_words / len_starting_vocab, 3) * 100
        print(
            '\tRemoved {} numeric words from dictionary. It represents {}% of total words in starting vocabulary'
            .format(len_rm_words, freq))
        print('\tCurrent length of the vocabulary:', len_vocab)

        print('\nRemoving words that contain only one character...')
        docs = remove_word_with_length(docs, length=1)
        curr_len_vocab = len(Dictionary(docs))
        len_rm_words = len_vocab - curr_len_vocab
        len_vocab = curr_len_vocab
        freq = round(len_rm_words / len_starting_vocab, 3) * 100
        print(
            '\tRemoved {} one length characters from dictionary. It represents {}% of total words in starting vocabulary'
            .format(len_rm_words, freq))
        print('\tCurrent length of the vocabulary:', len_vocab)

        print('-' * 100)
        len_rm_words = len_starting_vocab - len_vocab
        freq = round(len_rm_words / len_starting_vocab, 3) * 100
        print(
            '\nRemoved {} total words from beginning dictionary. It represents {}% of total words in starting vocabulary\n'
            .format(len_rm_words, freq))
        print('-' * 100)

    else:
        print('\nWord analysis disabled')

        if enable_bigram:
            docs = add_bigram(docs, min_bigram_count)

        print('\nRemoving stopwords...')
        docs = remove_stop_words(docs, language)

        print('\nRemoving unique numbers (not words that contain numbers)...')
        docs = remove_numbers(docs)

        print('\nRemoving words that contain only one character...')
        docs = remove_word_with_length(docs, length=1)

    if lemmatizing:
        print('\nLemmatizing...')
        docs = lemmatize(docs)

    dataset['text'] = docs

    dictionary = Dictionary(dataset['text'])

    bef = len(dictionary)
    print('\nFiltering extremes...')
    dictionary.filter_extremes(no_below=extreme_no_below,
                               no_above=extreme_no_above)
    if basic_word_analysis:
        print('\n')
        print('-' * 100)
        if (extreme_no_above != 1) or (extreme_no_below != 1):
            if extreme_no_below > 1:
                extreme_no_below_str = str(
                    extreme_no_below) + ' ' + 'documents'
            else:
                extreme_no_below_str = str(
                    extreme_no_below * 100) + '%' + ' ' + 'documents'
            if extreme_no_above > 1:
                extreme_no_above_str = str(
                    extreme_no_above) + ' ' + 'documents'
            else:
                extreme_no_above_str = str(
                    extreme_no_above * 100) + '%' + ' ' + 'documents'
            print(
                '\nKeeping words in no fewer than {} & in no more than {}:'.format(
                    extreme_no_below_str, extreme_no_above_str))
            print(
                'Number of unique tokens reduced from {} to {}, representing {}% of the total vocabulary.'
                .format(bef, len(dictionary),
                        np.round(((bef - len(dictionary)) / bef) * 100, 3)))

    dataset['text'] = dataset['text'].apply(
        lambda x: [w for w in x if w in list(dictionary.token2id)])

    print('\nRemoving words that contain only one character...')
    dataset['text'] = remove_word_with_length(dataset['text'], length=1)

    print('\nDeleting rows that do not contain any text...')
    dataset = remove_empty_docs(dataset)
    print('\tDeleted {} rows because of no text'.format(final_n_obs -
                                                        dataset.shape[0]))

    print('\nNumber of unique tokens: %d' % len(dictionary))
    print('\nNumber of documents: %d \n' % len(dataset))

    print('\nPreprocessing timestamps...')
    n_years = int(str(dataset['timestamp'].iloc[-1]).split('-')[0]) - int(
        str(dataset['timestamp'].iloc[0]).split('-')[0])
    n_months = int(str(dataset['timestamp'].iloc[-1]).split('-')[1]) - int(
        str(dataset['timestamp'].iloc[0]).split('-')[1])

    dataset, date_range = timestamps_preprocessing(dataset, n_years, n_months,
                                                   temporality)

    date_range = [str(i).split(' ')[0] for i in date_range]

    for ind in range(len(date_range) - 1):
        print('Timeslice {} date range : from {} to {}'.format(
            ind, date_range[ind], date_range[ind + 1]))
    for subsample in dataset.groupby('timeslice'):
        print('Number of observations for timeslice {} : {}'.format(
            subsample[0], subsample[1].shape[0]))
    print('-' * 100)

    mapper_date = dict(zip([i for i in range(len(date_range))], date_range))

    dataset['text'] = dataset['text'].apply(lambda x: ' '.join(x))
    good_idx = []
    for idx in range(dataset.shape[0]):
        if dataset['text'].iloc[idx] != '':
            good_idx.append(idx)
    dataset = dataset.iloc[good_idx]

    print('\nBuilding file for fasttext training....')
    text_for_embeddings = list(dataset['text'])
    with open(path_to_texts_for_embedding, 'w') as f:
        for text in text_for_embeddings:
            f.write(text + '\n')

    print('Final data shape : {}'.format(dataset.shape))
    print('\nDone in {} minutes'.format(int((time() - t0) / 60)))

    return dict(dataset_preprocessed=dataset,
                dictionary=dictionary,
                vocab_size=len(dictionary),
                date_range=date_range)
Example #37
0
module_path = os.path.dirname(
    __file__
)  # needed because sample data files are located in the same folder
datapath = lambda fname: os.path.join(module_path, 'test_data', fname)

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.WARNING)

# set up vars used in testing ("Deerwester" from the web tutorial)
texts = [['human', 'interface', 'computer'],
         ['survey', 'user', 'computer', 'system', 'response', 'time'],
         ['eps', 'user', 'interface', 'system'],
         ['system', 'human', 'system', 'eps'], ['user', 'response', 'time'],
         ['trees'], ['graph', 'trees'], ['graph', 'minors', 'trees'],
         ['graph', 'minors', 'survey']]
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]


def testfile():
    # temporary data will be stored to this file
    return os.path.join(tempfile.gettempdir(), 'gensim_models.tst')


class TestLsiModel(unittest.TestCase):
    def setUp(self):
        self.corpus = mmcorpus.MmCorpus(datapath('testcorpus.mm'))

    def testTransform(self):
        # create the transformation model
        model = lsimodel.LsiModel(self.corpus, num_topics=2)
Example #38
0
import json
from gensim.models import TfidfModel
from gensim.corpora import Dictionary
from gensim import matutils


def vectorize(docs, vocab_size):
    '''
    docs :: iterable of iterable of (int, number)
    vocab_size :: size of the vocabulary
    '''
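    # corpus2dense returns a dense matrix of shape (vocab_size, num_docs):
    # one row per term id, one column per document.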
    return matutils.corpus2dense(docs, vocab_size)


if __name__ == '__main__':
    with open('finance_news_test.json', encoding='utf-8') as f:
        data = json.load(f)
        data = [doc.split() for doc in data]

    dct = Dictionary.load('news.dict')
    corpus = [dct.doc2bow(doc) for doc in data]
    model = TfidfModel.load('news_tfidf.model')
    vocab_size = len(dct.token2id)

    for doc in corpus:
        # print(model[doc],len(model[doc]),len(vectorize([model[doc]],vocab_size)),len(vectorize([model[doc]],vocab_size)[0]))
        # break
        print(vectorize([model[doc]], vocab_size))
Example #39
0
    def recommend(self,s_title):
        
        docs = self.df['text'].copy()

        # Split the documents into tokens.
        tokenizer = RegexpTokenizer(r'\w+')
        for idx in range(len(docs)):
            docs[idx] = docs[idx].lower()  # Convert to lowercase.
            docs[idx] = tokenizer.tokenize(docs[idx])  # Split into words.

        # Remove words that are only one character.
        docs = [[token for token in doc if len(token) > 1] for doc in docs]
        
        # Compute bigrams.
        from gensim.models import Phrases

        # Add bigrams to docs (only ones that appear at least 5 times and pass the scoring threshold).
        bigram = Phrases(docs, min_count=5, threshold=10)
        for idx in range(len(docs)):
            for token in bigram[docs[idx]]:
                if '_' in token:
                    # Token is a bigram, add to document.
                    docs[idx].append(token)
                    
        from gensim.corpora import Dictionary

        # Create a dictionary representation of the documents.
        dictionary = Dictionary(docs)

        # Filter out words that occur less than 20 documents, or more than 50% of the documents.
        dictionary.filter_extremes(no_below=20, no_above=0.5)
        
        corpus = [dictionary.doc2bow(doc) for doc in docs]
        
        # Train LDA model.
        from gensim.models import LdaModel, LdaMulticore

        # Set training parameters.
        num_topics = 15
        chunksize = 2000
        passes = 20
        iterations = 100
        eval_every = None  # Don't evaluate model perplexity, takes too much time.

        # Make a index to word dictionary.
        temp = dictionary[0]  # This is only to "load" the dictionary.
        id2word = dictionary.id2token
        
        from gensim.models import CoherenceModel

        topic_size = [1,5,10,15,20,25,30,35,40]
        coherence_score = []
        print('|- Generating Model... -|')
        lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=15, 
                                           random_state=100,
                                           chunksize=100,
                                           passes=10,
                                           alpha=0.01,
                                           eta=0.9,iterations=300)
        
        for index, row in self.df.iterrows():
            for i in range(0,num_topics):
                self.df.at[index,'topic_'+str(i)] = 0
            for t in lda_model.get_document_topics(corpus[index]):
                self.df.at[index,'topic_'+str(t[0])] = t[1]
                
        # user has watched a title
        pick = s_title

        pick_row = self.df[self.df[self.indexer].str.lower() == pick.lower()]
        pick_index = pick_row.index.values[0]
        print('|- Generating Euclidean Distances... -|')
        def Euclidean(row, n_topics):
            pick_vec = []
            row_vec = []
            for i in range(0,n_topics):
                pick_vec.append(pick_row.iloc[0]['topic_'+str(i)])
                row_vec.append(row['topic_'+str(i)])

            # Get similarity based on the top k topics of the picked vector
            k = 10

            top_k_idx = np.argsort(pick_vec)[-k:]
            pick_vec = np.array(pick_vec)[top_k_idx]
            row_vec = np.array(row_vec)[top_k_idx]

            return np.linalg.norm(row_vec - pick_vec)

        # select nearest 10
        def getTopNByLDA(df, col, n):
            return df.sort_values(by = col).head(n)
        
        # compute lda distances
        filteredData = self.df.copy()
        for index, row in filteredData.iterrows():
            filteredData.at[index,'lda'] = Euclidean(filteredData.iloc[index], num_topics)
        print("|- Complete! Stored recommendation DataFrame under the 'recommendations' key! -|")
        filteredData = filteredData[filteredData.index != pick_index]
        
        return {
            'result': getTopNByLDA(filteredData, 'lda', self.n_recommendations)[[self.indexer,'lda']].sort_values('lda'),
            'n_recommendations': self.n_recommendations,
            'indexer': self.indexer,
            'feature_names': self.feature_names,
        }
Example #40
0
    parser.add_argument('-f',
                        '--fileTag',
                        type=str,
                        required=True,
                        dest="fileTag",
                        help='fileTag as prefix for all exported files')
    args = parser.parse_args()

    fileTag = args.fileTag
    collections = args.collections

    corpora_path = "./corpora/"

    #### Step 1, build dictionary object ####
    print("Start to build dictionary object.")
    dct = Dictionary()
    # use Timer to print elapsed time
    with Timer():
        for each_collection in collections:
            print("Reading the corpus for {}".format(each_collection))
            file_path = f"{corpora_path}{each_collection}-raw-corpus.tsv"
            for i, a_tweet in enumerate(TweetRawCorpusStream(file_path)):
                token_f = [
                    x for x in a_tweet.tokens_str.split(",") if len(x) > 1
                ]
                dct.add_documents([token_f], prune_at=None)
            sizeofCorpus = i + 1
            print(f"In total {sizeofCorpus} tweets in {each_collection}.")
    print("Original size of vocabs: {}".format(len(dct)))
    # control the vocabulary
    dct.filter_extremes(no_below=40,
Example #41
0
class WikipediaDataSet:
    def __init__(self, src_dir_path: str, cache_dir_path: str):
        self.src_dir_path = src_dir_path
        self.cache_dir_path = cache_dir_path
        self.dictionary = Dictionary()
        self.cache_file_paths = []

        tokenizer = MeCab.Tagger('-Ochasen')
        self.load_file(tokenizer)

    @staticmethod
    def tokenize(tokenizer: MeCab.Tagger, text):
        words = []
        word_infos = tokenizer.parse(text).split('\n')[:-2]
        for word_info in word_infos:
            word_info = word_info.split('\t')
            if '名詞' in word_info[3] or '動詞' in word_info[
                    3] or '形容詞' in word_info[3]:
                words.append(word_info[2])
        return words

    @staticmethod
    def article_to_words(tokenizer: MeCab.Tagger, article: str):
        match = re.search(r'\<doc(.|\s)*?\>\n', article)
        article = article[match.end():]
        match = re.search(r'\</doc>', article)
        article = article[:match.start()]

        texts = []
        for line in article.split('\n'):
            if not line:
                continue
            texts.append(WikipediaDataSet.tokenize(tokenizer, line))

        return texts

    def load_file(self, tokenizer: MeCab.Tagger):
        os.makedirs(self.cache_dir_path, exist_ok=True)

        for subdir_name in os.listdir(self.src_dir_path):
            subdir_path = os.path.join(self.src_dir_path, subdir_name)
            file_path_to_save = os.path.join(self.cache_dir_path, subdir_name)
            if os.path.exists(file_path_to_save):
                with open(file_path_to_save, 'rb') as _:
                    texts = dill.load(_)
            else:
                texts = []
                for file_name in os.listdir(subdir_path):
                    file_path = os.path.join(subdir_path, file_name)
                    with bz2.open(file_path, 'r') as _:
                        raw_articles = _.read().decode('utf-8')

                    match = re.search(r'\<doc(.|\s)*?\</doc>\n', raw_articles)
                    while match:
                        start, end = match.span()
                        article = raw_articles[start:end]
                        texts += WikipediaDataSet.article_to_words(
                            tokenizer, article)
                        raw_articles = raw_articles[end:]
                        match = re.search(r'\<doc(.|\s)*?\</doc>\n',
                                          raw_articles)

                file_path_to_save = os.path.join(self.cache_dir_path,
                                                 subdir_name)
                with open(file_path_to_save, 'wb') as _:
                    dill.dump(texts, _)

            self.dictionary.add_documents(texts)
            self.cache_file_paths.append(file_path_to_save)

    def get_text(self):
        for file_path_to_load in np.random.permutation(self.cache_file_paths):
            with open(file_path_to_load, 'rb') as _:
                texts = dill.load(_)
            for text in np.random.permutation(texts):
                yield text

    def __len__(self) -> int:
        return self.dictionary.num_docs
Example #42
0
    """
    return os.path.join(tempfile.gettempdir(), suffix)


@contextlib.contextmanager
def temporary_file(name=""):
    """create a temporary directory and return a path to "name" in that directory

    At the end of the context, the directory is removed.

    The function doesn't create the file.
    """
    # note : when dropping python2.7 support, we can use tempfile.TemporaryDirectory
    tmp = tempfile.mkdtemp()
    try:
        yield os.path.join(tmp, name)
    finally:
        shutil.rmtree(tmp, ignore_errors=True)


# set up vars used in testing ("Deerwester" from the web tutorial)
common_texts = [['human', 'interface', 'computer'],
                ['survey', 'user', 'computer', 'system', 'response', 'time'],
                ['eps', 'user', 'interface', 'system'],
                ['system', 'human', 'system', 'eps'],
                ['user', 'response', 'time'], ['trees'], ['graph', 'trees'],
                ['graph', 'minors', 'trees'], ['graph', 'minors', 'survey']]

common_dictionary = Dictionary(common_texts)
common_corpus = [common_dictionary.doc2bow(text) for text in common_texts]
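

# Minimal usage sketch (illustrative only, not part of the original test helpers):
# temporary_file() yields a path inside a scratch directory and removes the whole
# directory when the context exits; the file itself only exists if the caller
# creates it. The Dictionary save/load round-trip is a hypothetical example payload.
if __name__ == '__main__':
    with temporary_file('deerwester.dict') as tmp_path:
        common_dictionary.save(tmp_path)
        reloaded = Dictionary.load(tmp_path)
        assert reloaded.token2id == common_dictionary.token2id
    # tmp_path and its parent directory no longer exist here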
Example #43
0
 def LDALoad(self):
     self.ldamodel = LdaModel.load("fixed_time_window_lda.model")
     self.dictionary = Dictionary.load("lda_dictionary.model")
     print(self.dictionary)
Example #44
0
def term_document_matrix(
        documents, dictionary: Dictionary) -> List[List[Tuple[int, int]]]:
    return [dictionary.doc2bow(text) for text in documents]
def train_val_test(dataset: pd.DataFrame, dictionary: Dictionary,
                   test_size: float, val_size: float) -> Dict[str, Any]:

    # Make train val test index
    num_docs = len(dataset)
    vaSize = int(np.floor(val_size * num_docs))
    tsSize = int(np.floor(test_size * num_docs))
    trSize = int(num_docs - vaSize - tsSize)
    idx_permute = np.random.permutation(num_docs).astype(int)
    print('Reading data....')

    # Make sure our text column is of type list
    dataset['text'] = dataset['text'].apply(lambda x: x.split(' '))
    word2id = dict([(w, j) for j, w in dictionary.items()])
    id2word = dict([(j, w) for j, w in dictionary.items()])

    # Remove words not in train_data
    print('Starting vocabulary : {}'.format(len(dictionary)))

    vocab = list(dictionary)

    docs_tr = [[
        word2id[w] for w in dataset['text'][idx_permute[idx_d]] if w in word2id
    ] for idx_d in range(trSize)]
    timestamps_tr = pd.DataFrame(
        dataset['timeslice'][idx_permute[range(trSize)]])
    idx_tr = idx_permute[range(trSize)]

    docs_ts = [[
        word2id[w] for w in dataset['text'][idx_permute[idx_d + trSize]]
        if w in word2id
    ] for idx_d in range(tsSize)]
    timestamps_ts = pd.DataFrame(dataset['timeslice'][idx_permute[range(
        trSize, trSize + tsSize)]])
    idx_ts = idx_permute[range(trSize, trSize + tsSize)]

    docs_va = [[
        word2id[w]
        for w in dataset['text'][idx_permute[idx_d + trSize + tsSize]]
        if w in word2id
    ] for idx_d in range(vaSize)]
    timestamps_va = pd.DataFrame(dataset['timeslice'][idx_permute[range(
        tsSize + trSize, num_docs)]])
    idx_va = idx_permute[range(tsSize + trSize, num_docs)]

    print(
        '  Number of documents in train set : {} [this should be equal to {} and {}]'
        .format(len(docs_tr), trSize, len(timestamps_tr)))
    print(
        '  Number of documents in test set : {} [this should be equal to {} and {}]'
        .format(len(docs_ts), tsSize, len(timestamps_ts)))
    print(
        '  Number of documents in validation set: {} [this should be equal to {} and {}]'
        .format(len(docs_va), vaSize, len(timestamps_va)))

    # Split the test set into two halves: the first containing the first half of the words in each document,
    # the second containing the second half. These will be used to compute test completion perplexity.

    print('Splitting test documents in 2 halves...')
    docs_ts_h1 = [[w for i, w in enumerate(doc) if i <= len(doc) / 2.0 - 1]
                  for doc in docs_ts]
    docs_ts_h2 = [[w for i, w in enumerate(doc) if i > len(doc) / 2.0 - 1]
                  for doc in docs_ts]

    print('Creating lists of words...')

    words_tr = create_list_words(docs_tr)
    words_ts = create_list_words(docs_ts)
    words_ts_h1 = create_list_words(docs_ts_h1)
    words_ts_h2 = create_list_words(docs_ts_h2)
    words_va = create_list_words(docs_va)

    print('  Total number of words used in train set : ', len(words_tr))
    print('  Total number of words used in test set : ', len(words_ts))
    print(
        '  Total number of words used in the first test half (first half of each document): ',
        len(words_ts_h1))
    print(
        '  Total number of words used in the second test half (second half of each document): ',
        len(words_ts_h2))
    print('  Total number of words used in val set : ', len(words_va))

    n_docs_tr = len(docs_tr)
    n_docs_ts = len(docs_ts)
    n_docs_ts_h1 = len(docs_ts_h1)
    n_docs_ts_h2 = len(docs_ts_h2)
    n_docs_va = len(docs_va)

    # Get doc indices
    print('Getting doc indices...')

    doc_indices_tr = create_doc_indices(docs_tr)
    doc_indices_ts = create_doc_indices(docs_ts)
    doc_indices_ts_h1 = create_doc_indices(docs_ts_h1)
    doc_indices_ts_h2 = create_doc_indices(docs_ts_h2)
    doc_indices_va = create_doc_indices(docs_va)

    print('Creating bow representation...')

    bow_tr = create_bow(doc_indices_tr, words_tr, n_docs_tr, len(vocab))
    bow_ts = create_bow(doc_indices_ts, words_ts, n_docs_ts, len(vocab))
    bow_ts_h1 = create_bow(doc_indices_ts_h1, words_ts_h1, n_docs_ts_h1,
                           len(vocab))
    bow_ts_h2 = create_bow(doc_indices_ts_h2, words_ts_h2, n_docs_ts_h2,
                           len(vocab))
    bow_va = create_bow(doc_indices_va, words_va, n_docs_va, len(vocab))

    print(' Train bag of words shape : {}'.format(bow_tr.shape))
    print(' Test bag of words shape : {}'.format(bow_ts.shape))
    print(' Test set 1 bag of words shape : {}'.format(bow_ts_h1.shape))
    print(' Test set 2 bag of words shape : {}'.format(bow_ts_h2.shape))
    print(' Val bag of words shape : {}'.format(bow_va.shape))

    print('\nMost important words in train BOW : \n')
    print(get_most_important_words(bow_tr, id2word))
    print('\nMost important words in val BOW : \n')
    print(get_most_important_words(bow_va, id2word))
    print('\nMost important words in test BOW : \n')
    print(get_most_important_words(bow_ts, id2word))
    print('\nDone splitting data.')

    return dict(BOW_train=bow_tr,
                BOW_test=bow_ts,
                BOW_test_h1=bow_ts_h1,
                BOW_test_h2=bow_ts_h2,
                BOW_val=bow_va,
                timestamps_train=timestamps_tr,
                timestamps_test=timestamps_ts,
                timestamps_val=timestamps_va,
                train_vocab_size=len(vocab),
                train_num_times=len(np.unique(timestamps_tr['timeslice'])),
                idx_train=idx_tr,
                idx_test=idx_ts,
                idx_val=idx_va)
Example #46
0
 def __init__(self, cut_off=0.2, cleanup_urls=True, nltk_tokenizer=False):
     super().__init__(cleanup_urls=cleanup_urls,
                      nltk_tokenizer=nltk_tokenizer)
     self.dictionary = Dictionary(self.corpus)
     self.tfidf = TfidfModel(dictionary=self.dictionary)
        for word_document in tqdm(word_documents):
            sws = [sp.tokenize(word) for word in word_document]
            sw_documents.append(list(chain.from_iterable(sws)))
    if os.path.exists(config_dic.get("cache_dir")):
        print(
            f"Write Cache data. {os.path.join(config_dic.get('cache_dir'), config_dic.get('train_name') + '.sw_documents')}"
        )
        with open(
                os.path.join(config_dic.get("cache_dir"),
                             config_dic.get('train_name') + ".sw_documents"),
                "wb") as f:
            f.write(cloudpickle.dumps(sw_documents))

    print("=========== Build vocabulary ===========")
    special_token_dict = {PADDING: 0, UNKNOWN: 1}
    word_dic = Dictionary(word_documents)
    word_dic.filter_extremes(no_below=10, no_above=1.0)
    word_dic.patch_with_special_tokens(special_token_dict)
    sw_dic = Dictionary(sw_documents)
    sw_dic.filter_extremes(no_below=5, no_above=1.0)
    sw_dic.patch_with_special_tokens(special_token_dict)
    char_documents = [[[char for char in word] for word in document]
                      for document in word_documents]  # number of documents x number of characters
    char_dic = Dictionary(list(chain.from_iterable(char_documents)))
    char_dic.patch_with_special_tokens(special_token_dict)

    word_dic.save(
        os.path.join(config_dic.get("vocab_dir"),
                     f"{config_dic.get('train_name')}.word.dic"))
    char_dic.save(
        os.path.join(config_dic.get("vocab_dir"),
Example #48
0
class Word2VecWmdRelaxSimilarity(Word2VecSimilarityBase):
    def __init__(self, cut_off=0.2, cleanup_urls=True, nltk_tokenizer=False):
        super().__init__(cleanup_urls=cleanup_urls,
                         nltk_tokenizer=nltk_tokenizer)
        self.dictionary = Dictionary(self.corpus)
        self.tfidf = TfidfModel(dictionary=self.dictionary)

    def get_similar_bugs(self, query):

        query = self.text_preprocess(self.get_text(query))
        words = [
            word for word in set(chain(query, *self.corpus))
            if word in self.w2vmodel.wv
        ]
        indices, words = zip(*sorted(((index, word) for (
            index, _), word in zip(self.dictionary.doc2bow(words), words))))
        query = dict(self.tfidf[self.dictionary.doc2bow(query)])
        query = [(new_index, query[dict_index])
                 for new_index, dict_index in enumerate(indices)
                 if dict_index in query]
        documents = [
            dict(self.tfidf[self.dictionary.doc2bow(document)])
            for document in self.corpus
        ]
        documents = [[(new_index, document[dict_index])
                      for new_index, dict_index in enumerate(indices)
                      if dict_index in document] for document in documents]
        embeddings = np.array([self.w2vmodel.wv[word] for word in words],
                              dtype=np.float32)
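        # nbow maps each document key to (name, word-index list, weight list),
        # the bag-of-words layout expected by the wmd-relax WMD class.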
        nbow = dict(((index, list(chain([None], zip(*document))))
                     for index, document in enumerate(documents)
                     if document != []))
        nbow["query"] = tuple([None] + list(zip(*query)))
        distances = WMD(embeddings, nbow,
                        vocabulary_min=1).nearest_neighbors("query")

        return [
            self.bug_ids[distance[0]] for distance in distances
            if self.bug_ids[distance[0]] != query["id"]
        ]

    def get_distance(self, query1, query2):
        query1 = self.text_preprocess(self.get_text(query1))
        query2 = self.text_preprocess(self.get_text(query2))

        words = [
            word for word in set(chain(query1, query2, *self.corpus))
            if word in self.w2vmodel.wv
        ]
        indices, words = zip(*sorted(((index, word) for (
            index, _), word in zip(self.dictionary.doc2bow(words), words))))
        query1 = dict(self.tfidf[self.dictionary.doc2bow(query1)])
        query2 = dict(self.tfidf[self.dictionary.doc2bow(query2)])

        query1 = [(new_index, query1[dict_index])
                  for new_index, dict_index in enumerate(indices)
                  if dict_index in query1]
        query2 = [(new_index, query2[dict_index])
                  for new_index, dict_index in enumerate(indices)
                  if dict_index in query2]
        embeddings = np.array([self.w2vmodel.wv[word] for word in words],
                              dtype=np.float32)
        nbow = {}
        nbow["query1"] = tuple([None] + list(zip(*query1)))
        nbow["query2"] = tuple([None] + list(zip(*query2)))
        distances = WMD(embeddings, nbow,
                        vocabulary_min=1).nearest_neighbors("query1")

        return distances[0][1]
i=0
for x in mycol.find():
	document=(x['text']).lower()
	temp=process_text(document)
	
	for word in temp:
		docmap[word]=i

		
	i=i+1
	train_text.append(temp)

print("no of entries in train text is %d"%len(train_text))

dictionary = Dictionary(train_text)

corpus = [dictionary.doc2bow(text) for text in train_text]


hdpmodel1 = HdpModel(corpus=corpus, id2word=dictionary)


x=hdpmodel1.show_topics(num_topics=30,num_words=200)


twords={}
for topic,word in x:
	twords[str(topic)]=(re.sub('[^A-Za-z ]+', '', word)).split()

stemmer = PorterStemmer()
translate_tab = {ord(p): u" " for p in punctuation}

def text2tokens(raw_text):
    """Split the raw_text string into a list of stemmed tokens."""
    clean_text = raw_text.lower().translate(translate_tab)
    tokens = [token.strip() for token in tokenizer.tokenize(clean_text)]
    tokens = [token for token in tokens if token not in eng_stopwords]
    #stemmed_tokens = [stemmer.stem(token) for token in tokens]
    return [token for token in tokens if len(token) > 2]  # skip short tokens

dataset = [text2tokens(txt) for txt in newsgroups['data'] if len(text2tokens(txt))>0]  # convert each document to a list of tokens
targets = [newsgroups['target'][i] for i,txt in enumerate(newsgroups['data']) if len(text2tokens(txt))>0]

from gensim.corpora import Dictionary
dictionary = Dictionary(documents=dataset, prune_at=None)
dictionary.filter_extremes(no_below=5, no_above=0.6, keep_n=None)  # use Dictionary to remove irrelevant tokens
dictionary.compactify()

vocab = dictionary.token2id

print("Newsgroup loaded")

print("Downloading fasttext")
fasttext_vectors = gensim.downloader.load('fasttext-wiki-news-subwords-300')
print("Fasttext downloaded")

embeddings = np.zeros((len(dictionary), 300))
for w,i in dictionary.token2id.items():
    try:
        embeddings[i] = fasttext_vectors.wv[w]
Example #51
0
import csv
from gensim.corpora import Dictionary

from seq2seq.seq2seq import Encoder, Decoder

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

corpus = []
with open('dataset.csv', encoding='utf-8') as fp:
    reader = csv.reader(fp)
    for i, row in enumerate(reader):
        if i == 0: continue  # skip the CSV header row
        corpus.append(row[0].split(' '))
        corpus.append(row[1].split(' '))
N = len(corpus) // 2
dct = Dictionary(corpus)
word2id = dct.token2id
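# gensim's Dictionary builds its id2token mapping lazily; indexing it once forces
# that mapping to be populated (the returned token itself is not used).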
initialize = dct[0]
dct_len = len(word2id)
word2id.update({"<pad>": dct_len, "<eos>": dct_len + 1})
id2word = {v: k for k, v in word2id.items()}

seq_len = 10


def load_dataset():
    def load_sent_list(training=True):
        sent_list = []
        with open('dataset.csv', encoding='utf-8') as fp:
            reader = csv.reader(fp)
            for i, row in enumerate(reader):
Example #52
0
from gensim.corpora import Dictionary
from string import punctuation
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk import TweetTokenizer

#############
# Functions #
#############

# eliminate the "stop words"
stop_words = stopwords.words('english') + list(punctuation) + [
    'nt', 's'
]  # + remove stuff like n't, 's, ...
# load the dictionary and the tf-idf model
dictionary = Dictionary.load("ressources/dictionary")
tfidf_model = TfidfModel.load("ressources/tfidf_model")


# checks whether a character string is alphanumeric
def is_alpha(string):
    regex = re.compile('[^a-zA-Z]')
    return bool(regex.sub('', string))


# checks whether a character string is a URL
def tokenize(text):
    words = word_tokenize(text)
    words = [
        re.sub('[\']', '', w.lower()) for w in words
        if not re.match('//*/*', w)
Example #53
0
class LSISimilarity(BaseSimilarity):
    def __init__(self,
                 cleanup_urls=True,
                 nltk_tokenizer=False,
                 confidence_threshold=0.8):
        super().__init__(
            cleanup_urls=cleanup_urls,
            nltk_tokenizer=nltk_tokenizer,
            confidence_threshold=confidence_threshold,
        )
        self.corpus = []

        for bug in bugzilla.get_bugs():

            textual_features = self.text_preprocess(self.get_text(bug))
            self.corpus.append([bug["id"], textual_features])

        # Assigning unique integer ids to all words
        self.dictionary = Dictionary(text for bug_id, text in self.corpus)

        # Conversion to BoW
        corpus_final = [
            self.dictionary.doc2bow(text) for bug_id, text in self.corpus
        ]

        # Initialize and apply the TF-IDF transformation model on the same corpus; the resulting corpus has the same dimensions
        tfidf = models.TfidfModel(corpus_final)
        corpus_tfidf = tfidf[corpus_final]

        # Transform TF-IDF corpus to latent 300-D space via Latent Semantic Indexing
        self.lsi = models.LsiModel(corpus_tfidf,
                                   id2word=self.dictionary,
                                   num_topics=300)
        corpus_lsi = self.lsi[corpus_tfidf]

        # Indexing the corpus
        self.index = similarities.Similarity(output_prefix="simdata.shdat",
                                             corpus=corpus_lsi,
                                             num_features=300)

    def search_similar_bugs(self, query, k=10):
        query_summary = "{} {}".format(query["summary"],
                                       query["comments"][0]["text"])
        query_summary = self.text_preprocess(query_summary)

        # Transforming the query to latent 300-D space
        vec_bow = self.dictionary.doc2bow(query_summary)
        vec_lsi = self.lsi[vec_bow]

        # Perform a similarity query against the corpus
        sims = self.index[vec_lsi]
        sims = sorted(enumerate(sims), key=lambda item: -item[1])

        # Get IDs of the k most similar bugs
        return [
            self.corpus[j[0]][0] for j in sims[:k]
            if self.corpus[j[0]][0] != query["id"]
        ]

    def get_distance(self, query1, query2):
        raise NotImplementedError
Example #54
0
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

# Load a potentially pretrained model from disk.
lda_model =  models.LdaModel.load('lda_model')

# Load previous dictionary
id2word = Dictionary.load_from_text('/Users/hellofutrue/Desktop/Insight/Python/Feb/dictionary')

posts_influencers = pd.read_csv('/Users/hellofutrue/Desktop/Insight/Python/Feb/files/posts_influencers.csv')
posts_influencers = posts_influencers.rename(index=str, columns={'Unnamed: 0': "people", '0': 'content'})
data = posts_influencers.content.values.tolist()

def preprocessing(dat):
    # Tokenization
    def sent_to_words(sentences):
        for sentence in sentences:
            yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations
    
    data_words = list(sent_to_words(dat))
    # Build the bigram and trigram models
    bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
    trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  
Example #55
0
        R_all+=R
        all_count+=1
    P_all/=all_count
    R_all/=all_count
    try:
        F1=2*P_all*R_all/(P_all+R_all)
    except:
        F1=0.0
    return P_all,R_all,F1

f=open(homedir+"/results/ontology/c2n.json",'r')
c2n=json.load(f)
f.close()
prefix='http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#'
ncit_dict=[k.split('#')[1] for k in c2n.keys()]
dictionary=Dictionary([ncit_dict]);dictionary[0]

model_name="MLPsparse_1hidden"
model=get_model_S3(model_name)

topic_num=[5,10,20,25,40,50,100,200,250]

for tn in topic_num:
	lda=AuthorTopicModel.load(homedir+"/results/models/lda2000_topic"+str(tn))
	threshold=0.0
	volume=100
	while threshold<1.0:
		alpha=0.0
		while alpha<1.0:
			P,R,F=test_on_doc_S3_atmodel(lda,model,volume,alpha,threshold)
			f=open(homedir+"/results/logs/lda_eval_topic"+str(tn),'a')
Example #56
0
import sys

from gensim.corpora import Dictionary
from gensim.matutils import corpus2dense
from gensim.models import word2vec
from sklearn import decomposition

data_file = sys.argv[1]
pca_num = int(sys.argv[2])

sentences = [s for s in word2vec.LineSentence(data_file) if len(s) >= 2]

dic = Dictionary(sentences)

corpus = [dic.doc2bow(s) for s in sentences]

x = corpus2dense(corpus, len(dic)).T

pca = decomposition.PCA(n_components = pca_num, random_state = 1)

nx = pca.fit_transform(x)

print(sum(pca.explained_variance_ratio_))
Example #57
0
class gensim_data(object):
    def __init__(self,mashup_descriptions, api_descriptions, mashup_categories=None, api_categories=None,tag_times=2,mashup_only=False,strict_train=False):
        self.mashup_only =mashup_only
        self.strict_train = strict_train
        # Combine the text and tag information: all information for one mashup/api is merged into a single row
        if tag_times>0 and mashup_categories is not None:
            assert len(mashup_descriptions)==len(mashup_categories)
            self.mashup_dow=[[]]*len(mashup_descriptions)
            for i in range(len(mashup_descriptions)):
                self.mashup_dow[i]=mashup_descriptions[i]
                for j in range(tag_times):
                    self.mashup_dow[i] += mashup_categories[i]  # concatenate text and tags directly; is there a better way, e.g. increasing occurrence counts?
        else:
            self.mashup_dow = mashup_descriptions
        self.mashup_dow = [[str(index) for index in indexes] for indexes in self.mashup_dow]  # 2D list
        # print (self.mashup_dow[0])

        if tag_times>0 and api_categories is not None:
            assert len (api_descriptions) == len (api_categories)
            self.api_dow=[[]]*len(api_descriptions)
            for i in range(len(api_descriptions)):
                self.api_dow[i]=api_descriptions[i]
                for j in range(tag_times):
                    self.api_dow[i]+=api_categories[i]
        else:
            self.api_dow=api_descriptions
        self.api_dow = [[str (index) for index in indexes] for indexes in self.api_dow]

        if not self.mashup_only and not self.strict_train:
            self.dct = Dictionary(self.mashup_dow + self.api_dow)
        if self.mashup_only and self.strict_train:
            # encode the mashups/apis used for training
            self.train_mashup_dow = [self.mashup_dow[m_id] for m_id in dataset.crt_ds.his_mashup_ids]
            self.dct = Dictionary(self.train_mashup_dow)
            self.train_mashup_dow = [self.dct.doc2bow(mashup_info) for mashup_info in self.train_mashup_dow]  # word id -> count
        # in any case, compute a feature for every mashup/api
        self.mashup_dow = [self.dct.doc2bow(mashup_info) for mashup_info in self.mashup_dow]  # word id -> count for every mashup text
        print('self.mashup_dow, num:',len(self.mashup_dow))
        zero_num = sum([1 if len(mashup_info)==0 else 0 for mashup_info in self.mashup_dow])
        print('zero_num',zero_num)
        self.api_dow = [self.dct.doc2bow(api_info) for api_info in self.api_dow]

        # print('len of self.mashup_dow,self.api_dow:{},{}'.format(len(self.mashup_dow),len (self.api_dow)))

        self.num_topics = 0
        self.model = None  # model used to process the text
        self._mashup_features = None  # feature vectors extracted from the text
        self._api_features = None

        self.mashup_topics = None  # top-N topics of each text
        self.api_topics = None

    # Only record whether a word appears in a text (binary); used to compute cosine and Jaccard similarity
    def get_binary_v(self):
        dict_size=len(self.dct)
        mashup_binary_matrix=np.zeros((meta_data.mashup_num,dict_size))
        api_binary_matrix = np.zeros ((meta_data.api_num, dict_size))
        mashup_words_list=[]  # all words that appear in each mashup
        api_words_list = []
        for i in range(meta_data.mashup_num):
            temp_words_list,_=zip(*self.mashup_dow[i])
            mashup_words_list.append(temp_words_list)
            for j in temp_words_list:  # indices of the words that appear
                mashup_binary_matrix[i][j]=1.0

        for i in range(meta_data.api_num):
            temp_words_list,_=zip(*self.api_dow[i])
            api_words_list.append(temp_words_list)
            for j in temp_words_list:  # indices of the words that appear
                api_binary_matrix[i][j]=1.0
        return mashup_binary_matrix,api_binary_matrix,mashup_words_list,api_words_list

    def model_pcs(self,model_name,LDA_topic_num=None):
        # HDP result format: [(0, 0.032271167132309014), (1, 0.02362695056720504)]
        if self.mashup_only:
            if self.strict_train:
                train_corpus = self.train_mashup_dow
            else:
                train_corpus = self.mashup_dow
        else:
            if self.strict_train:
                train_corpus = self.train_mashup_dow + self.train_api_dow
            else:
                train_corpus = self.mashup_dow + self.api_dow

        if model_name=='HDP':
            self.model = HdpModel(train_corpus, self.dct)
            self.num_topics = self.model.get_topics ().shape[0]
            print('num_topics',self.num_topics)
        elif model_name=='TF_IDF':
            self.model =TfidfModel (train_corpus)
            self.num_topics=len(self.dct)
        elif model_name=='LDA':
            if LDA_topic_num is None:
                self.model = LdaModel(train_corpus)
            else:
                self.model = LdaModel(train_corpus,num_topics=LDA_topic_num)
            self.num_topics = self.model.get_topics ().shape[0]

        else:
            raise ValueError('wrong gensim_model name!')

        # Process the texts with the model, then convert them to a standard numpy format (a value for every topic)
        # print(self.mashup_dow)
        self.mashup_features=[self.model[mashup_info] for mashup_info in self.mashup_dow]  # feature of each mashup and api
        # print(self.mashup_features)
        print('self.mashup_features, num:', len(self.mashup_features))
        zero_num1 = sum([1 if len(mashup_feature)==0 else 0 for mashup_feature in self.mashup_features])
        print('zero_num1',zero_num1)
        for i in range(len(self.mashup_features)):
            if len(self.mashup_features[i])==0:
                print(self.mashup_dow[i])

        self.api_features = [self.model[api_info] for api_info in self.api_dow]
        # print('when model-pcs,len of mashup_features and api_features:{},{}'.format(len(mashup_features),len(api_features)))
        self._mashup_features=np.zeros((meta_data.mashup_num, self.num_topics))
        self._api_features = np.zeros((meta_data.api_num, self.num_topics))
        for i in range(meta_data.mashup_num):  # only some dimensions have values; convert into a regular dense array
            for index,value in self.mashup_features[i]:
                self._mashup_features[i][index]=value
        for i in range(meta_data.api_num):
            for index,value in self.api_features[i]:
                self._api_features[i][index]=value
        return self._mashup_features, self._api_features

    def get_topTopics(self,topTopicNum=3):  # select the top-K topics with the highest probability [(),(),...]
        mashup_topics = []
        api_topics = []
        for index in range(meta_data.mashup_num):
            sorted_mashup_feature = sorted(self.mashup_features[index],key = lambda x:x[1],reverse=True)
            try:
                topic_indexes,_ = zip(*sorted_mashup_feature)
            except:
                # sometimes mashup_bow is non-empty but mashup_feature is empty
                topic_indexes = random.sample(range(meta_data.mashup_num),topTopicNum)
                # print(self.mashup_dow[index])
                # print(self.mashup_features[index])
                # print(sorted_mashup_feature)
                # raise ValueError('wrong 138!')
            num = min(len(topic_indexes),topTopicNum)
            mashup_topics.append(topic_indexes[:num])
        for index in range(meta_data.api_num):
            sorted_api_feature = sorted(self.api_features[index], key=lambda x: x[1], reverse=True)
            try:
                topic_indexes,_ = zip(*sorted_api_feature)
            except:
                topic_indexes = random.sample(range(meta_data.api_num), topTopicNum)
            num = min(len(topic_indexes),topTopicNum)
            api_topics.append(topic_indexes[:num])
        return mashup_topics,api_topics
Example #58
0
class LDAForEvent:
    height_weight = 8  # the weight of height in Manhattan distance
    delete_weight = 10  # the weight of delete a character when matching
    add_weight = 10  # the weight of add a character when matching
    ldamodel = LdaModel
    dictionary = corpora.Dictionary
    temp_dic = []
    dictionary = Dictionary.load("lda_dictionary.model")
    # load raw data into workspace
    @staticmethod
    def read_excel(file):
        time_window = 40 * 1000
        data = xlrd.open_workbook(file)
        table = data.sheets()[0]

        start = 0  # starting row
        # end = 164  # ending row
        end = len(table.col_values(0))  # from the first line to last line
        rows = end - start
        list_values = ""
        flag = 0
        word_brffer = ''
        start_time = float(table.row_values(0)[0])
        end_time = start_time + time_window
        x = start
        while x < end:
            # for x in range(start, end):
            row = table.row_values(x)

            flag += 1
            temp_s_time = float(row[0])
            temp_e_time = float(row[1])
            temp_char = row[2]
            # handle time
            if temp_e_time < end_time:  # the event is in the current time window
                word_brffer += temp_char
            elif temp_s_time > end_time:  # the event is out of the window
                start_time = end_time
                end_time = start_time + time_window
                list_values += word_brffer + ' '
                word_brffer = ""
                x -= 1
            elif temp_s_time < end_time < temp_e_time:
                start_time = temp_e_time + 1
                end_time = start_time + time_window
                list_values += word_brffer + temp_char + ' '
                word_brffer = ''
            x += 1

        # print([list_values])
        # datamatrix = np.array(list_values)
        # print(datamatrix)
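        # list_values is a single string: one whitespace-separated "word" per time
        # window, each word being the concatenation of the event characters that
        # fell inside that window.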
        return list_values

    # used to calculate the distance between two characters
    # second version
    # using characters a-z A-Z
    def calcDis(self, char_1, char_2):

        if ord(char_1) > 140:
            height1 = (ord(char_1) - ord('a')) % 7
            width1 = (ord(char_1) - ord('a')) / 7
        else:
            height1 = (ord(char_1) - ord('C')) % 7
            width1 = (ord(char_1) - ord('C')) / 7
        if ord(char_2) > 140:
            height2 = (ord(char_2) - ord('a')) % 7
            width2 = (ord(char_2) - ord('a')) / 7
        else:
            height2 = (ord(char_2) - ord('C')) % 7
            width2 = (ord(char_2) - ord('C')) / 7
        partA = self.height_weight * abs(height1 - height2)
        partB = abs(width1 - width2)
        return partA + partB

    #   fuzzyEvent2
    #   used to match words with different length
    def fuzzyEvent(self, s1, s2):
        match_matrix = [[0 for _ in range(len(s2) + 1)]
                        for _ in range(len(s1) + 1)
                        ]  # (len(s1) + 1) rows, (len(s2) + 1) columns

        for i in range(len(s1) + 1):
            match_matrix[i][0] = self.add_weight * i
        for j in range(len(s2) + 1):
            match_matrix[0][j] = self.delete_weight * j
        for i in range(1, len(s1) + 1):
            for j in range(1, len(s2) + 1):
                match_matrix[i][j] = min(
                    match_matrix[i - 1][j - 1] +
                    self.calcDis(self, s1[i - 1], s2[j - 1]),
                    match_matrix[i - 1][j] + self.add_weight,
                    match_matrix[i][j - 1] + self.delete_weight)

        sum_distance = match_matrix[len(s1)][len(s2)]
        max_unit = 100  # should change while the add_weight and delete_weight changed
        return (max_unit - sum_distance) / max_unit

    # doc is test document
    # dic is the dictionary of lda model
    def testEvent(self, doc, dic=[]):
        if len(dic) == 0:
            dic = self.dictionary
        testV = []
        for i in range(len(dic)):
            temp = [i, 0]
            testV.append(temp)
        for word in doc:
            f_max = 0
            flag = 1

            temp_testV = [0 for i in range(len(testV))]
            for index in range(len(dic)):
                if dic[index] == word:
                    testV[index][1] += 1
                    break
                if abs(len(word) - len(dic[index])) > 3:
                    continue
                grade = self.fuzzyEvent(self, word, dic[index])
                if f_max < grade:
                    f_max = grade
                    flag = 1
                elif f_max == grade:
                    flag += 1
                temp_testV[index] = grade
            for index in range(len(testV)):
                if f_max == temp_testV[index]:
                    testV[index][1] += 1 / flag
        return self.ldamodel[testV]

    def LDALoad(self):
        self.ldamodel = LdaModel.load("fixed_time_window_lda.model")
        self.dictionary = Dictionary.load("lda_dictionary.model")
        print(self.dictionary)
        # print(len(self.dictionary))

    def LDATest(self, test):
        result = self.testEvent(self, test, self.dictionary)
        return result
with open(out_f, 'wb') as pipe:
    pickle.dump(docs_phrased, pipe)

# check outcome of nlp pipeline
print('''
=============================================================================
published article:
-----------------------------------------------------------------------------
{}
=============================================================================
tokenized article:
-----------------------------------------------------------------------------
{}
=============================================================================
tri-grammed tokenized article:
-----------------------------------------------------------------------------
{}
'''.format(docs[1], docs_tokens[1], docs_phrased[1]))

# %% get corpus & dictionary to use for further nlp analysis

# get dictionary and write it to a file
pr_dictionary = Dictionary(docs_phrased)
pr_dictionary.save('.data/pr_dictionary.dict')

# get corpus and write it to a file
pr_corpus = [pr_dictionary.doc2bow(doc) for doc in docs_phrased]
out_f = os.path.join('.data', 'pr_corpus.mm')
MmCorpus.serialize(out_f, pr_corpus)
mm = MmCorpus(out_f)
Example #60
0
class LDARetrieval():

    def __init__(self, docs, get_model=False, num_topics=10, passes=6, iterations=40, prep_search=False):
        
      fDICT = "./models/lda_dict.dat"

      fCORPUS = "./models/lda_corpus.dat"
      if os.path.exists(fDICT) and os.path.exists(fCORPUS):
          print("Loading corpus from disk...")
          with open(fDICT, "rb") as fp:
              self.dictionary = pkl.load(fp)
          with open(fCORPUS, "rb") as fp:
              self.corpus = pkl.load(fp)
      else:
          print("Processing documents...")
          doclist = [docs[doc] for doc in docs]  
          self.dictionary = Dictionary(doclist)
          self.dictionary.filter_extremes(no_below=400, no_above=0.333)
          self.corpus = [self.dictionary.doc2bow(doc) for doc in doclist]
          with open(fDICT, "wb") as fp:
              pkl.dump(self.dictionary, fp)
          with open(fCORPUS, "wb") as fp:
              pkl.dump(self.corpus, fp)
      if get_model:
          self.get_model(num_topics=num_topics, passes=passes, iterations=iterations, prep_search=prep_search, docs=docs)
    
    def train(self, num_topics, chunksize=10000, passes=6, iterations=40, eval_every=40):
      fmodel = f"./models/lda_{num_topics}top_{iterations}iter_{passes}pass"
#       logging.basicConfig(filename=fmodel + ".log",
#                     format="%(asctime)s:%(levelname)s:%(message)s",
#                     level=logging.INFO)
      
      temp = self.dictionary[0] 
      id2word = self.dictionary.id2token 
      model = LdaMulticore( corpus=self.corpus,
                            id2word=id2word,
                            chunksize=chunksize,
                            iterations=iterations,
                            num_topics=num_topics,
                            passes=passes,
                            eval_every=eval_every)
      model.save(fmodel + ".pt")
      self.model = model

#       p = re.compile("(-*\d+\.\d+) per-word .* (\d+\.\d+) perplexity")
#       matches = [p.findall(l) for l in open(fmodel+'.log')]
#       matches = [m for m in matches if len(m) > 0]
#       tuples = [t[0] for t in matches]
#       perplexity = [float(t[1]) for t in tuples]
#       liklihood = [float(t[0]) for t in tuples]
#       iter = list(range(0,len(tuples)*10,10))
#       plt.plot(iter,liklihood,c="black")
#       plt.ylabel("log liklihood")
#       plt.xlabel("iteration")
#       plt.title("Topic Model Convergence")
#       plt.grid()
#       plt.savefig(fmodel + ".pdf") 
      
      return model

    def prepare_search(self, docs):
      fdocsearch = f"./models/docs_{self.model.num_topics}search.dat"
    
      if os.path.exists(fdocsearch):
        print("Loading docs for search from disk...")
        with open(fdocsearch, "rb") as fp:
          self.docvecs = pkl.load(fp)
      else:
        print("Preparing docs for search...")
        self.docvecs = {}
        for doc in docs:
          docvec = np.zeros(self.model.num_topics)
          doc_repr = self.dictionary.doc2bow(docs[doc])
          for i, frac in self.model[doc_repr]:
            docvec[i] = frac
          self.docvecs[doc] = docvec
        with open(fdocsearch, "wb") as fp:
          pkl.dump(self.docvecs, fp)

    def get_model(self, num_topics, passes=6, iterations=40, prep_search=False, docs=None):
      fname = f"./models/lda_{num_topics}top_{iterations}iter_{passes}pass"
      if not os.path.exists(fname + ".pt"):
        print("Model not found...")
        return None
      self.model = LdaModel.load(fname + ".pt")
      if prep_search:
        self.prepare_search(docs)
      return self.model

    def search(self, query):
        query_repr = self.dictionary.doc2bow(read_ap.process_text(query))
        qvec = np.zeros(self.model.num_topics)
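        # model[bow] only returns topics whose probability exceeds the model's
        # minimum_probability, so the remaining entries of qvec stay zero.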
        for i, frac in self.model[query_repr]:
          qvec[i] = frac

        results = {}
        for doc in self.docvecs:
          results[doc] = -kl_divergence(self.docvecs[doc], qvec)

        results = list(results.items())
        results.sort(key=lambda _: -_[1])
        return results