def __init__(self, modelpackagepath, packageFiles):
    for key in packageFiles.keys():
        packageFiles[key] = os.path.abspath(os.path.join(modelpackagepath, packageFiles[key]))
    self.packageFiles = packageFiles
    self.models = {"dictionary": None, "m1Model": None, "m1Index": None, "m2Model": None, "m2Index": None, "detectors": None, "sampleUtterances": None, "mappings": None, "modelInfo": None}
    # modelInfo is optional: fall back to defaults if the file is missing or unreadable
    try:
        with open(self.packageFiles["modelInfo"], "r") as fp:
            self.models["modelInfo"] = ModelInfo(json.loads(fp.read()))
    except Exception:
        self.models["modelInfo"] = ModelInfo({})
    try:
        self.models["dictionary"] = corpora.Dictionary.load(self.packageFiles["dictionaryFile"])
    except Exception:
        raise ModelFileLoadFailed("Failed to load dictionary from file " + self.packageFiles["dictionaryFile"])
    try:
        self.models["m1Model"] = TfidfModel.load(self.packageFiles["m1ModelFile"])
    except Exception:
        raise ModelFileLoadFailed("Failed to load model from file " + self.packageFiles["m1ModelFile"])
    try:
        self.models["m1Index"] = similarities.MatrixSimilarity.load(self.packageFiles["m1IndexFile"])
    except Exception:
        raise ModelFileLoadFailed("Failed to load index from file " + self.packageFiles["m1IndexFile"])
    # the m2 artifacts are loaded only to verify the files, then released to reclaim memory
    try:
        self.models["m2Model"] = TfidfModel.load(self.packageFiles["m2ModelFile"])
        self.models["m2Model"] = None
        gc.collect()
    except Exception:
        raise ModelFileLoadFailed("Failed to load model from file " + self.packageFiles["m2ModelFile"])
    try:
        self.models["m2Index"] = similarities.MatrixSimilarity.load(self.packageFiles["m2IndexFile"])
        self.models["m2Index"] = None
        gc.collect()
    except Exception:
        raise ModelFileLoadFailed("Failed to load index from file " + self.packageFiles["m2IndexFile"])
    try:
        with open(self.packageFiles["detectorsFile"], "r") as f:
            self.models["detectors"] = json.loads(f.read())
    except Exception:
        raise ModelFileLoadFailed("Failed to parse json from file " + self.packageFiles["detectorsFile"])
    if self.models["modelInfo"].detectorContentSplitted:
        try:
            with open(self.packageFiles["mappingsFile"], "r") as f:
                self.models["mappings"] = json.loads(f.read())
        except Exception:
            raise ModelFileLoadFailed("Failed to parse json from file " + self.packageFiles["mappingsFile"])
    # sample utterances are likewise parsed to validate the file, then dropped
    try:
        with open(self.packageFiles["sampleUtterancesFile"], "r") as f:
            self.models["sampleUtterances"] = json.loads(f.read())
        self.models["sampleUtterances"] = None
        gc.collect()
    except Exception:
        raise ModelFileLoadFailed("Failed to parse json from file " + self.packageFiles["sampleUtterancesFile"])
Example #2
def make_scores_for_sample():
    doc2vec_model = doc2vec.Doc2Vec.load('doc2vec_weigths')
    logging.info('doc2vec loaded')
    tfidf_unigram_model = TfidfModel.load('tfidf_unigram')
    logging.info('tfidf unigram loaded')
    tfidf_bigram_model = TfidfModel.load('tfidf_bigram')
    logging.info('tfidf bigram loaded')
    d1 = corpora.Dictionary.load('./dict_1.gensim')
    logging.info('dict1 loaded')
    d2 = corpora.Dictionary.load('./dict_2.gensim')
    logging.info('dict2 loaded')
    queries = pd.read_csv('./queries_norm.tsv', sep='\t', header=None, index_col=0)
    sample = pd.read_csv('./sample.csv', sep=',').sort_values(by=['DocumentId'])
    with open('./submission.csv', 'w') as f:
        writer = csv.writer(f, delimiter=',')
        writer.writerow(['QueryId', 'DocumentId', 'Score'])
        for idx, row in tqdm(sample.iterrows()):
            query_id = row['QueryId']
            doc_id = row['DocumentId']
            doc2vec_score = doc2vec_model.docvecs.similarity('DOC_%d' % doc_id, 'QUERY_%d' % query_id)
            doc = get_doc(doc_id)
            query = str(queries.loc[query_id])
            doc_title = str(doc[1])
            doc_content = str(doc[2])

            doc_title_words = doc_title.split()
            doc_content_words = doc_content.split()
            query_words = query.split()

            doc_title_bigrams = d2.doc2bow(list(map(lambda x: '\t'.join(x), zip(doc_title_words[:-1], doc_title_words[1:]))))
            doc_content_bigrams = d2.doc2bow(list(map(lambda x: '\t'.join(x), zip(doc_content_words[:-1], doc_content_words[1:]))))
            query_bigrams = d2.doc2bow(list(map(lambda x: '\t'.join(x), zip(query_words[:-1], query_words[1:]))))

            doc_title_words = d1.doc2bow(doc_title_words)
            doc_content_words = d1.doc2bow(doc_content_words)
            query_words = d1.doc2bow(query_words)

            doc_title_words = tfidf_unigram_model[doc_title_words]
            doc_content_words = tfidf_unigram_model[doc_content_words]
            query_words = tfidf_unigram_model[query_words]

            doc_title_bigrams = tfidf_bigram_model[doc_title_bigrams]
            doc_content_bigrams = tfidf_bigram_model[doc_content_bigrams]
            query_bigrams = tfidf_bigram_model[query_bigrams]

            tfidf_title_score_uni = matutils.cossim(doc_title_words, query_words)
            tfidf_content_score_uni = matutils.cossim(doc_content_words, query_words)
            tfidf_title_score_bi = matutils.cossim(doc_title_bigrams, query_bigrams)
            tfidf_content_score_bi = matutils.cossim(doc_content_bigrams, query_bigrams)

            score = (2 * tfidf_content_score_bi + 2 * tfidf_title_score_uni + tfidf_content_score_uni + 0.5 * doc2vec_score) / 5.5
            writer.writerow([query_id, doc_id, score])
def make_lda_model():
    tfidf_model = TfidfModel.load((output_dir / 'tfidf_model.pkl').as_posix())
    lda_model = LdaModel(
        nmf_iterator(CONTENT_FILES, Dict.load((output_dir / 'dict.pkl').as_posix()), tfidf_model),
        num_topics=TOPIC_NUM)
    lda_model.save((output_dir / 'lda_model.pkl').as_posix())
def load_tfidf_model(path):
    '''
    Loads a TF-IDF model from file.

    :param path: the path
    :type path: string
    :return: the loaded TfidfModel
    '''
    model = TfidfModel.load(path)
    return model
def tfidf_w2v_top5w(all_docs_prepro, id_dict):
    with open('../code/similarity/mappings/map_w2v_tfidf_5w.pkl', 'rb') as fp:
        Classes = pickle.load(fp)
    mapping = Classes['mapping']

    print('Loading Word2vec model')
    model_path = 'embedding/models/word2vec_all.model'
    model_w2v = Word2Vec.load(model_path)

    print('Loading Tfidf model')
    model_path = 'embedding/models/tfidf_all.model'
    model_tfidf = TfidfModel.load(model_path)

    dct = Dictionary(all_docs_prepro)
    corpus = [dct.doc2bow(line) for line in all_docs_prepro]

    mean_ticket_ques = top5_average('ticket_ques',
                                    corpus=corpus,
                                    dct=dct,
                                    model_w2v=model_w2v,
                                    model_tfidf=model_tfidf,
                                    id_dict=id_dict,
                                    all_docs_prepro=all_docs_prepro)

    return (mean_ticket_ques, mapping)
Example #6
def tfidf_w2v_top5w(all_docs_prepro, id_dict, thresh):
    print('Loading Word2vec model')
    model_path = 'embedding/models/word2vec_all.model'
    model_w2v = Word2Vec.load(model_path)

    print('Loading Tfidf model')
    model_path = 'embedding/models/tfidf_all.model'
    model_tfidf = TfidfModel.load(model_path)

    dct = Dictionary(all_docs_prepro)
    corpus = [dct.doc2bow(line) for line in all_docs_prepro]

    mean_ticket_ans = top5_average(dat='ticket_ans',
                                   corpus=corpus,
                                   dct=dct,
                                   model_w2v=model_w2v,
                                   model_tfidf=model_tfidf,
                                   id_dict=id_dict,
                                   all_docs_prepro=all_docs_prepro)
    mean_faq_ans = top5_average(dat='faq_ans',
                                corpus=corpus,
                                dct=dct,
                                model_w2v=model_w2v,
                                model_tfidf=model_tfidf,
                                id_dict=id_dict,
                                all_docs_prepro=all_docs_prepro)

    output = compute_sim(mean_ticket_ans, mean_faq_ans, thresh)

    with open("../code/similarity/mappings/map_w2v_tfidf_5w.pkl", "wb") as fp:
        pickle.dump(output, fp)
def main():
    logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')

    parser = OptionParser()
    parser.add_option('-f', '--corpus-file')
    parser.add_option('-p', '--parse-procs', default=1, type=int)
    parser.add_option('-s', '--sublexicalize-procs', default=1, type=int)
    parser.add_option('-t', '--tfidf-model')
    parser.add_option('-v', '--vocabulary')
    parser.add_option('-m', '--model-file')
    opts, args = parser.parse_args()

    corpus_fn = opts.corpus_file or sys.exit()
    n_proc_parse = opts.parse_procs
    n_proc_sublex = opts.sublexicalize_procs
    vocab_fn = opts.vocabulary
    tfidf_fn = opts.tfidf_model
    model_fn = opts.model_file or sys.exit()

    with BZ2File(corpus_fn) as f:
        corpus = SublexicalizedCorpus(WikiCorpus(corpus_fn, processes=n_proc_parse, dictionary=Dictionary()),
                                      order=(3, 6), clean_func=normalize_whitespace, n_proc=n_proc_sublex,
                                      create_dictionary=False)

        if vocab_fn and os.path.exists(vocab_fn):
            logging.info("Loading vocabulary from %s" % vocab_fn)
            vocab = Dictionary.load(vocab_fn)
        else:
            logging.info("Creating vocabulary")

            start = time.perf_counter()
            vocab = Dictionary(corpus.get_texts())
            end = time.perf_counter()
            logging.info("Vocabulary created in %d seconds" % (end - start))

            if vocab_fn:
                logging.info("Saving dictionary to %s" % vocab_fn)
                vocab.save(vocab_fn)

        corpus.dictionary = vocab

        corpus.dictionary.filter_extremes(no_below=5, no_above=.8)
        corpus.dictionary.compactify()

        if tfidf_fn and os.path.exists(tfidf_fn):
            logging.info("Reading TF-IDF model from %s" % tfidf_fn)
            tfidf = TfidfModel.load(tfidf_fn)
        else:
            logging.info("creating TF-IDF model")
            tfidf = TfidfModel(corpus)

            if tfidf_fn:
                logging.info("Saving TFF-IDF model to %s" % tfidf_fn)
                tfidf.save(tfidf_fn)

        bow_corpus = (tfidf[art] for art in corpus)

        model = LsiModel(corpus=bow_corpus, num_topics=10, id2word=corpus.dictionary)

        model.save(model_fn)
Example #8
def main(JDK, url, title, query):
    dictionary = corpora.Dictionary.load(
        './TFIDF_Word2Vec/data/tfidf-w2v_dictionary.dict')
    tfidf = TfidfModel.load('./TFIDF_Word2Vec/data/tfidf.model')
    word2vec = gensim.models.keyedvectors.Word2VecKeyedVectors.load(
        './TFIDF_Word2Vec/data/word2vec.model')
    tfidf_w2v_model = models.keyedvectors.Word2VecKeyedVectors.load(
        './TFIDF_Word2Vec/data/tfidf-w2v.model')

    query_vec = get_tfidf_w2v_vec(query, dictionary, tfidf, word2vec)
    full_entity_score_vec = tfidf_w2v_model.similar_by_vector(query_vec,
                                                              topn=False)
    sort_sims = sorted(enumerate(full_entity_score_vec),
                       key=lambda item: -item[1])

    result = []
    for i in range(10):
        dic = {
            'url': url[sort_sims[i][0]].strip('\n'),
            'JDK': JDK[sort_sims[i][0]].strip('\n'),
            'title': title[sort_sims[i][0]].strip('\n'),
            'score': sort_sims[i][1]
        }
        result.append(dic)

    return result
Example #9
    def loadmodel(self, nameprefix):
        """ Load the topic model with the given prefix of the file paths.

        Given the prefix of the file paths, load the corresponding topic model. The files
        include a JSON (.json) file that specifies various parameters, a gensim dictionary (.gensimdict),
        a topic model (.gensimmodel), and a similarity matrix (.gensimmat). If weighting is applied, load also the tf-idf model (.gensimtfidf).

        :param nameprefix: prefix of the file paths
        :return: None
        :type nameprefix: str
        """
        # load the JSON file (parameters)
        parameters = json.load(open(nameprefix + '.json', 'rb'))
        self.nb_topics = parameters['nb_topics']
        self.toweigh = parameters['toweigh']
        self.algorithm = parameters['algorithm']
        self.classlabels = parameters['classlabels']

        # load the dictionary
        self.dictionary = Dictionary.load(nameprefix + '.gensimdict')

        # load the topic model
        self.topicmodel = gensim_topic_model_dict[self.algorithm].load(
            nameprefix + '.gensimmodel')

        # load the similarity matrix
        self.matsim = MatrixSimilarity.load(nameprefix + '.gensimmat')

        # load the tf-idf model
        if self.toweigh:
            self.tfidf = TfidfModel.load(nameprefix + '.gensimtfidf')

        # flag
        self.trained = True
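
For reference, a minimal sketch of loading the same artifacts directly with gensim; the 'mytopicmodel' prefix is illustrative, and the files are assumed to follow the naming convention described in the docstring above:

import json
from gensim.corpora import Dictionary
from gensim.models import TfidfModel
from gensim.similarities import MatrixSimilarity

prefix = 'mytopicmodel'  # hypothetical prefix; these files must already exist
parameters = json.load(open(prefix + '.json'))         # training parameters
dictionary = Dictionary.load(prefix + '.gensimdict')   # token <-> id mapping
matsim = MatrixSimilarity.load(prefix + '.gensimmat')  # similarity index
# the tf-idf model exists only when weighting was applied during training
tfidf = TfidfModel.load(prefix + '.gensimtfidf') if parameters['toweigh'] else None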
Example #10
def tfidf(dataframe, max_words=None):
    """Returns a tf-idf model for documents stored in a DataFrame.

    Precomputed models are read from file if previously cached, or generated then cached otherwise.

    Parameters
    ----------
    dataframe : Pandas DataFrame
        The DataFrame containing the documents to process.
    max_words : int, optional
        The maximum number of word ids used by the model (via a hashed dictionary); if None, a full dictionary is built instead.

    Returns
    -------
    model : Gensim TfidfModel
        tf-idf model for documents stored in the DataFrame.
    """
    suffix = '_{}'.format(max_words) if max_words else ''
    filename = 'caches/models/tfidf{}.model'.format(suffix)

    if not os.path.isfile(filename):
        if max_words:
            dictionary = hashdictionary_corpus(dataframe, id_range=max_words)
        else:
            dictionary = dictionary_corpus(dataframe)
        tfidf_model = TfidfModel(dictionary=dictionary)
        tfidf_model.save(filename)
    else:
        tfidf_model = TfidfModel.load(filename)

    return tfidf_model
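
The same read-through caching pattern in a self-contained form; the function name, cache path, and toy corpus below are illustrative, not part of the original module:

import os
from gensim.corpora import Dictionary
from gensim.models import TfidfModel

def cached_tfidf(texts, filename='caches/models/tfidf.model'):
    # Return a TfidfModel, loading it from disk if previously cached.
    if os.path.isfile(filename):
        return TfidfModel.load(filename)
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    model = TfidfModel(dictionary=Dictionary(texts))
    model.save(filename)
    return model

model = cached_tfidf([['hello', 'world'], ['hello', 'gensim']])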
    def tfidf_model(self):
        print('Logging Info - Get Tf-idf model...')
        tfidf_model_path = os.path.join(FEATURE_DIR, '{}_tfidf.model'.format(self.genre))
        dict_path = os.path.join(FEATURE_DIR, '{}_tfidf.dict'.format(self.genre))
        if os.path.exists(tfidf_model_path):
            dictionary = pickle_load(dict_path)
            tfidf_model = TfidfModel.load(tfidf_model_path)
        else:
            corpus = [
                text.split() for text in self.train_data['premise'] +
                self.train_data['hypothesis'] + self.dev_data['premise'] +
                self.dev_data['hypothesis'] + self.test_data['premise'] +
                self.test_data['hypothesis']
            ]
            dictionary = corpora.Dictionary(corpus)
            corpus = [dictionary.doc2bow(text) for text in corpus]
            tfidf_model = TfidfModel(corpus)

            del corpus
            tfidf_model.save(tfidf_model_path)
            pickle_dump(dict_path, dictionary)

        return dictionary, tfidf_model
def main():
    COURSE_NAME_STUBS = [
        "agile-planning-for-software-products",
        "client-needs-and-software-requirements",
        "design-patterns",
        "introduction-to-software-product-management",
        "object-oriented-design",
        "reviews-and-metrics-for-software-improvements",
        "service-oriented-architecture",
        "software-architecture",
        "software-processes-and-agile-practices",
        "software-product-management-capstone",
    ]
    for course_name in COURSE_NAME_STUBS:
        results_fp = os.path.join(DIR_PATH, "data",
                                  "eval.{}.pkl".format(course_name))
        course_results = None
        with open(results_fp, "rb") as rf:
            course_results = load(rf)

        tfidf_fp = os.path.join(DIR_PATH, "data",
                                "tfidf.{}.pkl".format(course_name))
        # with open(tfidf_fp, "rb") as tfidf_f:
        tfidf_model = TfidfModel.load(tfidf_fp)
        idf_vec_size = len(tfidf_model.idfs)

        analyze_course_results(course_name, course_results, idf_vec_size)
 def load(conf: Configuration, force: Optional[bool] = False,
          persist: Optional[bool] = True) -> "TFIDFRanker":
     model_path = conf.path_models + 'vsm_tfidf/' + conf.get_desc() + '/'
     if force or (not os.path.exists(model_path)) \
             or (not os.path.isfile(model_path + 'corpus.mm')) \
             or (not os.path.isfile(model_path + 'tfidf.model')):
         utils.mk_dir_if_not_exists(model_path)
         dataset = TFIDFRanker.extractor.load_dataset(conf=conf)
         dictionary = corpora.Dictionary([Ranker.get_text(conf, data) for (index, data) in dataset.iterrows()])
         bow_corpus = [(dictionary.doc2bow(Ranker.get_text(conf, data)), data['filename'])
                       for (index, data) in dataset.iterrows()]
         bow_corpus, names = map(list, zip(*bow_corpus))
         index_mapping = TFIDFRanker.build_index_mapping(names)
         corpora.MmCorpus.serialize(model_path + 'corpus.mm', bow_corpus)
         mm_corpus = corpora.MmCorpus(model_path + 'corpus.mm')
         tfidf_model = TfidfModel(mm_corpus)
         tfidf_index = SparseMatrixSimilarity(tfidf_model[mm_corpus],
                                              num_features=mm_corpus.num_terms)
         ranker = TFIDFRanker(dictionary=dictionary, bow_corpus=mm_corpus,
                              model=tfidf_model, index=tfidf_index, index_mapping=index_mapping, conf=conf)
         ranker.persist(model_path)
         logging.info('TFIDFRanker : initialized')
         logging.info('TFIDFRanker : model : {}'.format(tfidf_model))
         logging.info('TFIDFRanker : index : {}'.format(tfidf_index))
         return ranker
     else:
         dictionary = corpora.Dictionary.load(model_path + 'dict.dictionary')
         mm_corpus = corpora.MmCorpus(model_path + 'corpus.mm')
         tfidf_model = TfidfModel.load(model_path + 'tfidf.model')
         tfidf_index = SparseMatrixSimilarity.load(model_path + 'tfidf.index')
         with open(model_path + 'index_mapping.pickle', mode='rb') as file:
             index_mapping = pickle.load(file)
             logging.info('TFIDFRanker : initialized')
         return TFIDFRanker(dictionary=dictionary, bow_corpus=mm_corpus,
                            model=tfidf_model, index=tfidf_index, index_mapping=index_mapping, conf=conf)
Example #14
 def __init__(self):
     self.host = 'localhost'
     self.port = 3306
     self.user = '******'
     self.password = '******'
     self.db = 'gaojiruangong'
     self.charset = 'utf8'
     db = pymysql.Connect(host=self.host,
                          port=self.port,
                          user=self.user,
                          passwd=self.password,
                          db=self.db,
                          charset=self.charset)
     cursor = db.cursor()
     query_sql = "SELECT id, api FROM apisamplecode"
     cursor.execute(query_sql)
     results = cursor.fetchall()
     all_api_name_set = set()
     self.api_name_2_id = {}  # map lowercased api name -> list of ids
     for item in results:
         delete_left_brackets_api_name = item[1].split('(')[0]
         all_api_name_set.add(delete_left_brackets_api_name)
         api_name = delete_left_brackets_api_name.split('.')[-1].lower()
         api_id = item[0]
         self.api_name_2_id.setdefault(api_name, []).append(api_id)
     self.all_qualified_api_name = list(all_api_name_set)
     self.dictionary = corpora.Dictionary.load(
         ROOT_DIR + '/output/model/tfidf/tfidf_dictionary.dict')
     self.index = similarities.Similarity.load(
         ROOT_DIR + '/output/model/tfidf/tfidf_index.index')
     self.tfidf = TfidfModel.load(ROOT_DIR +
                                  '/output/model/tfidf/tfidf.model')
Example #15
 def __init__(self):
     self.stopwords = stopwords.words('english')
     # Lemmatizer
     self.lmtzr = WordNetLemmatizer()
     # Stemmer
     self.stemmer = PorterStemmer()
     self.word2vec_model = None
     self.words = re.compile(r"\w+", re.I)
     # each cached model is optional; fall back to None if loading fails
     try:
         self.bigrams = Phrases.load('slm/app/cached_models/bigrams.gensim')
     except Exception:
         self.bigrams = None
     try:
         self.trigrams = Phrases.load(
             'slm/app/cached_models/trigrams.gensim')
     except Exception:
         self.trigrams = None
     try:
         self.dictionary = corpora.Dictionary.load(
             'slm/app/cached_models/dictionary.dict')
     except Exception:
         self.dictionary = None
     try:
         self.tfidf = TfidfModel.load('slm/app/cached_models/tfidf.gensim')
     except Exception:
         self.tfidf = None
Example #16
    def __init__(self, model_prefix=None, num_best=None):
        self.model_prefix = model_prefix
        self.num_best = num_best
        if self.model_prefix is None:
            raise ValueError("model_prefix must be specified")

        logger.info("ESA: Loading word dictionary...")
        self.dictionary = Dictionary.load_from_text(model_prefix +
                                                    '_wordids.txt.bz2')

        logger.info("ESA: Loading document name map...")
        self.article_dict = utils.unpickle(model_prefix +
                                           '_bow.mm.metadata.cpickle')

        logger.info("ESA: Loading TF-IDF model...")
        self.tfidf = TfidfModel.load(model_prefix + '.tfidf_model')

        logger.info("ESA: Loading similarity index...")
        self.similarity_index = Similarity.load(model_prefix +
                                                '_similarity.index',
                                                mmap='r')

        #logger.info("ESA: Preloading reverse indexes...")
        #self.similarity_index.preload_reverse_index()

        logger.info("ESA: Finished loading model files.")
Example #17
 def __init__(self):
     self.words = [["ewhor", "e-whor"], ["stresser", "booter"], [" rat "],
                   ["crypt", "fud"]]
     # with open("data/models/word2vec.modelFile", "rb") as f:
     #     self.d2v_model = pickle.load(f)
     self.tfidf_model = TfidfModel.load("data/models/1tfidf.modelFile")
     self.dct = Dictionary.load("data/models/1tfidf.dct")
Example #18
    def init(self, system, subclass):
        conn = self.data_processor.connect_db(self.conf.db_host,
                                              self.conf.db_database,
                                              self.conf.db_user,
                                              self.conf.db_pass)
        # load the vocabulary and the model
        t = time.time()

        logger.debug("Initializing model loading for [%s-%s]", system, subclass)

        dic_name = "dictionary_" + system + "_" + subclass + ".dic"
        dictionary = Dictionary.load(self.model_dir + "/" + dic_name)
        logger.debug("Loaded dictionary: %s", dic_name)
        logger.debug("The bag of words contains %d words in total", len(dictionary.keys()))

        model_name = "tfidf_" + system + "_" + subclass + ".model"
        model = TfidfModel.load(self.model_dir + "/" + model_name)
        logger.debug("Loaded TF-IDF model: %s", model_name)

        df_train = pd.read_sql(
            "select * from monitor_cluster_dbscan where business_system_code='{}' and rule_type_code='{}'"
            .format(system, subclass), conn)

        # KNN clustering, then prediction
        knn = self.get_KNN_model(df_train, dictionary, model)
        duration(t, "Trained a KNN model on tf-idf vectors from the dictionary and this category's data")

        if knn is not None:
            key = system + "-" + subclass
            value = {'model': model, 'dictionary': dictionary, 'knn': knn}
            self.models[key] = value
Example #19
    def train(self):
        if not os.path.exists(os.path.join(DATA_ANSWER_PATH, 'tfidf.model')):
            traindata = p.load(open(CORPUS_PATH, 'rb'))
            for qid in self.trainset:
                duplicates = self.trainset[qid]['duplicates']
                for duplicate in duplicates:
                    question = duplicate['rel_question']['tokens']
                    traindata.append(question)

                    rel_comments = duplicate['rel_comments']
                    for rel_comment in rel_comments:
                        q2 = rel_comment['tokens']
                        traindata.append(q2)

            self.dict = Dictionary(traindata)  # fit dictionary
            corpus = [self.dict.doc2bow(line)
                      for line in traindata]  # convert corpus to BoW format
            self.tfidf = TfidfModel(corpus)  # fit model
            self.dict.save(os.path.join(DATA_ANSWER_PATH, 'dict.model'))
            self.tfidf.save(os.path.join(DATA_ANSWER_PATH, 'tfidf.model'))
        else:
            self.dict = Dictionary.load(
                os.path.join(DATA_ANSWER_PATH, 'dict.model'))
            self.tfidf = TfidfModel.load(
                os.path.join(DATA_ANSWER_PATH, 'tfidf.model'))
Example #20
    def __init__(self, dictionary_path, corpus_path, tfidf_path,
                 corpus_tfidf_path, tfidf_index_sim_path, lsi_path,
                 lsi_index_path, stopwords_path, tweet_corpus_path):

        self.dictionary = gensim.corpora.Dictionary.load(dictionary_path)
        self.corpus = MmCorpusMeta(corpus_path,
                                   id2word=self.dictionary,
                                   metadata=True)
        self.tweet_corpus = MmCorpusMeta(tweet_corpus_path,
                                         id2word=self.dictionary,
                                         metadata=True)
        self.tfidf = TfidfModel.load(tfidf_path)
        self.corpus_tfidf = gensim.utils.unpickle(corpus_tfidf_path)
        self.tfidf_index = gensim.similarities.MatrixSimilarity.load(
            tfidf_index_sim_path)
        self.lsi = LsiModel.load(lsi_path)
        self.lsi_index = gensim.similarities.MatrixSimilarity.load(
            lsi_index_path)
        with open(stopwords_path) as f:
            self.stopwords = json.load(f)

        self.tfidf_tweets = self.tfidf[self.tweet_corpus]
        self.lsi_tweets = self.lsi[self.tfidf_tweets]
        self.sim_tweets = gensim.similarities.MatrixSimilarity(self.lsi_tweets)
        print("loaded")
Example #21
 def __get_tfidf_model(self):
     if os.path.exists(os.path.join(self.out_dir, 'tfidf.model')):
         tfidf_model = TfidfModel.load(
             os.path.join(self.out_dir, 'tfidf.model'))
     else:
         raise FileNotFoundError('"tfidf.model" file not found!')
     return tfidf_model
 def _load_model(self, model_name):
     self.logger.warning('Loading DocumentRetriever models...')
     model_dir = Path('./model')
     self.dct = Dictionary.load(str(model_dir / f'{model_name}.dict'))
     self.tfidf = TfidfModel.load(str(model_dir / f'{model_name}.tfidf'))
     self.nlp = spacy.load('en_core_web_md')
     self.embeddings = KeyedVectors.load(
         str(model_dir / 'wiki-news-300d-1M-subword'))
Example #23
    def load_data(self):

        if not self.tf_idf_model:
            if not os.path.exists(self.tf_idf_model_path):
                raise Exception('TF-IDF model file not found')

            self.dictionary = Dictionary.load(self.dictionary_path)
            self.tf_idf_model = TfidfModel.load(self.tf_idf_model_path)
Example #24
def load_source():
    # load the dictionary
    dictionary = corpora.Dictionary.load("./Model/dictionary.dic")
    # load the model
    tfidf_vectors = TfidfModel.load("./Model/tfidf_vectors.model")
    # load the corpus
    # corpus = corpora.MmCorpus('/corpus.mm')
    return dictionary, tfidf_vectors
def remove_duplicate_code(sample_codes, descriptions):
    Threshold = 0.9

    dictionary = corpora.Dictionary.load('./output/test/tfidf_dictionary.dict')
    index = similarities.Similarity.load('./output/test/tfidf_index.index')
    tfidf = TfidfModel.load('./output/test/tfidf.model')

    remove_code_index = []

    print(len(descriptions))
    for i in range(len(descriptions)):
        vec_bow = dictionary.doc2bow(descriptions[i])
        vec_tfidf = tfidf[vec_bow]
        sims = index[vec_tfidf]
        sort_sims = sorted(enumerate(sims), key=lambda item: -item[1])
        for j in range(len(sort_sims)):
            if sort_sims[j][1] < Threshold:
                break
            else:
                if sample_codes[i]['API'] == sample_codes[
                        sort_sims[j][0]]['API'] and i != sort_sims[j][0]:
                    if abs(70 - len(sample_codes[i]['Description'].strip(
                    ).split(' '))) < abs(
                            70 - len(sample_codes[sort_sims[j][0]]
                                     ['Description'].strip().split(' '))):
                        # vec_bow1 = dictionary.doc2bow([sample_codes[i]['Code']])
                        # vec_tfidf1 = tfidf[vec_bow1]
                        # sims1 = index[vec_tfidf1]
                        #
                        # vec_bow2 = dictionary.doc2bow([sample_codes[sort_sims[j][0]]['Code']])
                        # vec_tfidf2 = tfidf[vec_bow2]
                        # sims2 = index[vec_tfidf2]

                        # if sims1[i] > sims2[sort_sims[j][0]]:
                        # if tfidf.similarity(sample_codes[i]['Code'], sample_codes[i]['Description']) > tfidf.similarity(sample_codes[sort_sims[j][0]]['Code'], sample_codes[sort_sims[j][0]]['Description']):
                        remove_code_index.append(sort_sims[j][0])
                    else:
                        remove_code_index.append(i)
        print(i)
    sample_codes_index = [i for i in range(len(sample_codes))]
    sample_codes_index = set(sample_codes_index)
    remove_code_index = set(remove_code_index)
    index = list(sample_codes_index - remove_code_index)
    sample_codes = [sample_codes[i] for i in index]

    # save the fully qualified name, the sample code, and the description
    save_file = []
    save_path = "RemoveDuplicateSampleCode.json"
    for sample_code in sample_codes:
        json_save = {}
        json_save['API'] = sample_code['API']
        json_save['Code'] = sample_code['Code']
        json_save['Description'] = sample_code['Description']
        save_file.append(json_save)
    with open(OUTPUT_DIR + '/' + save_path, 'w',
              encoding='utf-8') as json_file:
        json.dump(save_file, json_file, indent=4)
Example #26
    def fit(self, raw_documents, y=None):
        self.analyzer_func = self.build_analyzer()

        self.model = LsiModel.load(self.model_fn)

        if os.path.exists(self.model_fn + '.tfidf'):
            self.tfidf = TfidfModel.load(self.model_fn + '.tfidf')

        return self
    def load(self):
        """
        load the corpora created by `make_corpus.py`
        """
        self.corpus = MmCorpus(self.corpus_file)
        self.dictionary = Dictionary.load_from_text(self.dict_file)
        self.titles = load_titles(self.title_file)

        self.tfidf_model = TfidfModel.load(self.tfidf_model_file)
        self.index = MatrixSimilarity(self.tfidf_model[self.corpus])
Example #28
def generate_model(dictionary, bow_corpus, corpus_path):
    try:
        tfidf = TfidfModel.load(corpus_path + 'wiki-tfidf.model')
        print('tfidf model loaded')
    except Exception:
        # model file missing or unreadable: train a new one and save it
        tfidf = TfidfModel(bow_corpus, id2word=dictionary)
        tfidf.save(corpus_path + 'wiki-tfidf.model')
    return tfidf
 def get_tfidf(self, path):
     path = path + '.tfidf'
     if not os.path.exists(path):
         tfidf_model = TfidfModel(self.corpus, smartirs='ntc')
         tfidf_model.save(path)
         # reweight the corpus
         self.corpus = tfidf_model[self.corpus]
     else:
         tfidf_model = TfidfModel.load(path)
     return tfidf_model
Example #30
    def load(self, dir_path):
        dir_path = Path(dir_path)

        vocab_path = str(dir_path / self.VOCAB_FNAME)
        model_path = str(dir_path / self.TFIDF_FNAME)
        index_path = str(dir_path / self.INDEX_FNAME)

        self.vocab = Dictionary.load(vocab_path)
        self.model = TfidfModel.load(model_path)
        self.index = SparseMatrixSimilarity.load(index_path)
Example #31
 def __load_from_disk(self, path):
     """
     Function that is used internally to load and set-up the class state
     :param path: Location from where the class internal state should be loaded
     :return: None, side-effect on the class on which this is called
     """
     # Read the config
     with open(os.path.join(path, 'config.json')) as f:
         params = jsonpickle.decode(f.read())
     self.net_size_in_days = params['net_size_in_days']
     self.min_tok_len = params['min_tok_len']
     self.undersample_multiplicity = params['undersample_multiplicity']
     self.prediction_threshold = params['prediction_threshold']
     self.use_sim_cs = params['use_sim_cs']
     self.use_sim_j = params['use_sim_j']
     self.use_sim_d = params['use_sim_d']
     self.use_social = params['use_social']
     self.use_temporal = params['use_temporal']
     self.use_file = params['use_file']
     self.use_pr_only = params['use_pr_only']
     self.use_issue_only = params['use_issue_only']
     self.predictions_between_updates = params[
         'predictions_between_updates']
     name = params['name']
     try:
         with open(os.path.join(path, name, 'repository_data.json')) as f:
             self.repository_obj = jsonpickle.decode(f.read())
         with open(os.path.join(path, name, 'truth_data.json')) as f:
             self.truth = jsonpickle.decode(f.read())
     except FileNotFoundError:
         pass
     try:
         with open(os.path.join(path, name, 'fingerprint_data.json')) as f:
             self.fingerprint = jsonpickle.decode(f.read())
     except FileNotFoundError:
         pass
     try:
         self.dictionary = Dictionary.load_from_text(
             os.path.join(path, 'tfidf', 'term2id.txt'))
         self.model = TfidfModel.load(
             os.path.join(path, 'tfidf', 'model.tfidf'))
         with open(os.path.join(path, name, 'stopwords_data.json')) as f:
             self.stopwords = jsonpickle.decode(f.read())
     except FileNotFoundError:
         pass
     try:
         self.clf = pickle.load(
             open(os.path.join(path, 'clf_model', 'model.p'), 'rb'))
     except FileNotFoundError:
         pass
     try:
         self.feature_generator = pickle.load(
             open(os.path.join(path, 'feature_generator', 'gen.p'), 'rb'))
     except FileNotFoundError:
         pass
 def __init__(self):
     self.dictionary = Dictionary.load(app.config["RCMDR_DICT"])
     self.corpus = corpora.MmCorpus(app.config["RCMDR_CORPUS"])
     self.tfidf = TfidfModel.load(app.config["RCMDR_TFIDF_MODEL"])
     self.lda_model = LdaModel.load(app.config["RCMDR_LDA_MODEL"])
     self.lsi_model = LsiModel.load(app.config["RCMDR_LSI_MODEL"])
     self.lda_index = Similarity.load(app.config["RCMDR_LDA_INDEX"])
     self.lsi_index = Similarity.load(app.config["RCMDR_LSI_INDEX"])
     self.job_labels = {
         int(k): v
         for k, v in (line.split("=") for line in open(app.config["RCMDR_JOB_LABELS"]).read().strip().split("\n"))
     }
Example #33
def _tfidf(corpus, dictionary):
    tfidf_file_name = get_tfidf_file_name(CORPUS_FILES["label"])
    try:
        tfidf = TfidfModel.load(tfidf_file_name)
    except FileNotFoundError:
        corpus_numeric = [dictionary.doc2bow(document) for document in corpus]
        tfidf = TfidfModel(corpus=corpus_numeric)
        print("File does not exist - creating the tfidf model")

        create_file_and_folders_if_not_exist(tfidf_file_name)
        tfidf.save(tfidf_file_name)

    return tfidf
Example #35
#			corpus.save(f_bow)
	else: # models will be trained on your own corpus
		if os.path.exists(f_bow):
			corpus = TextCorpus.load(f_bow)
		else:
			corpus = TextCorpus(f_corpus)
#			corpus.save(f_bow)

	# filter dictionary
	corpus.dictionary.filter_extremes(no_below=0, no_above=1, keep_n=voc_size)
	corpus.dictionary.save(f_dict)
	corpus.save(f_bow)

	# tf-idf model
	if os.path.exists(f_tfidf):
		tfidf = TfidfModel.load(f_tfidf)
	else:
		tfidf = TfidfModel(corpus, id2word=corpus.dictionary)
		tfidf.save(f_tfidf)

	# TRAINING

	# lsa model
	if not os.path.exists(f_lsa):
		lsa = LsiModel(tfidf[corpus], id2word=corpus.dictionary, num_topics=lsa_dim)
		lsa.save(f_lsa)

	# word2vec model
	class MyCorpus():
		def __iter__(self):
			for d in corpus.get_texts():
Example #36
#print 'Saved dictionary'

print('Creating LogEntropy TF-IDF and regular TF-IDF matrices and models')
BOW_corpus = MmCorpus('../data/wiki_en_vocab200k') #Resurrect BOW corpus

#log_entropy = LogEntropyModel(BOW_corpus)
#log_entropy.save('../models/logEntropy.model') #already provided
log_entropy = LogEntropyModel.load('../models/logEntropy.model')
corpora.mmcorpus.MmCorpus.serialize('../data/log_entropy_matrix',
                                    log_entropy[BOW_corpus])

print('Saved LogEntropy TF-IDF matrix')

#tfidf = TfidfModel(BOW_corpus)
#tfidf.save('../models/tfidf.model') #already provided
tfidf = TfidfModel.load('../models/tfidf.model')
corpora.mmcorpus.MmCorpus.serialize('../data/tfidf_matrix',
                                    tfidf[BOW_corpus])

print('Saved TF-IDF matrix')

print('Creating Similarity Index')
logent_corpus = MmCorpus('../data/log_entropy_matrix')
num_feat = len(wiki.dictionary.keys())
index = Similarity('../data/logEntropyShards/logEntropySimilarity',
                   logent_corpus, num_features=num_feat)

index.save('../data/logEntropyShards/logEntropySimilarityIndex')
print('Saved Shards and similarity index')

print('Getting list of titles...')
Example #37
def scorer(model, dic):
    tfidf = TfidfModel.load(model)
    dictionary = Dictionary.load(dic)
    def score(words):
        return tfidf[dictionary.doc2bow(words)]
    return score
    if len(sys.argv) < 2:
        print(inspect.cleandoc(__doc__) % locals())
        sys.exit(1)
    model_prefix = sys.argv[1]

    logger.info("running %s" % ' '.join(sys.argv))

    logger.info("Loading word dictionary...")
    dictionary = Dictionary.load_from_text(model_prefix + '_wordids.txt.bz2')
    logger.debug(dictionary)

    logger.info("Loading document name map...")
    article_dict = utils.unpickle(model_prefix + '_bow.mm.metadata.cpickle')

    logger.info("Loading tf-idf model...")
    tfidf = TfidfModel.load(model_prefix + '.tfidf_model')

    logger.info("Loading similarity index...")
    similarity_index = Similarity.load(model_prefix + '_similarity.index', mmap='r')
    similarity_index.use_reverse_index = True

    logger.info("Finished loading model files.")

    mismatches = 0
    for doc_idx in range(0, len(similarity_index)):
        logger.info("Checking doc: %d %s" % (doc_idx, article_dict[doc_idx]))
        rev_doc = scipy.sparse.dok_matrix((1, len(dictionary)), dtype=np.float64)
        fwd_doc = similarity_index.vector_by_id(doc_idx)
        for feature_id, val in enumerate(fwd_doc.toarray().flatten()):
            if val == 0: continue
            feat_rev_docs = similarity_index.docs_by_feature_id(feature_id).toarray().flatten()
Example #39
    if len(sys.argv) < 3:
        print(inspect.cleandoc(__doc__) % locals())
        sys.exit(1)
    input_file, output_prefix = sys.argv[1:3]

    logger.info("running %s" % ' '.join(sys.argv))

    logger.info("Loading word dictionary...")
    dictionary = Dictionary.load_from_text(output_prefix + '_wordids.txt.bz2')
    logger.debug(dictionary)

    logger.info("Loading document name map...")
    article_dict = utils.unpickle(output_prefix + '_bow.mm.metadata.cpickle')

    logger.info("Loading tf-idf model...")
    tfidf = TfidfModel.load(output_prefix + '.tfidf_model')

    logger.info("Loading similarity index...")
    similarity_index = Similarity.load(output_prefix + '_similarity.index', mmap='r')
    similarity_index.use_reverse_index = True
    similarity_index.preload_reverse_index()

    logger.info("Finished loading model files.")

    logger.info("Processing input documents...")

    try:
        infile = open(input_file, 'r')
    except IOError:
        print('cannot open %s' % (input_file,))
        sys.exit(1)
# What about the raw, unprocessed unicode tweet text itself?

# In[6]:

import gzip
with gzip.open(os.path.join(DATA_PATH, 'datetimes.csv.gz'), 'rb') as f:
    nums = pd.read_csv(f, engine='python', encoding='utf-8')
with gzip.open(os.path.join(DATA_PATH, 'text.csv.gz'), 'rb') as f:
    corpus = pd.read_csv(f, encoding='utf8', index_col=0)  # DataFrame.from_csv was removed from pandas


# Now load previously compiled vocabulary and TFIDF matrix (transformation)

# In[11]:

tfidf = TfidfModel.load(os.path.join(DATA_PATH, 'tfidf'))
tfidf.num_docs


# In[17]:

bows = pd.Series(vocab.doc2bow(toks) for toks in corpus.tokens)
bows


# This would make a nice, compact sparse matrix representation of our entire corpus...  
# Which would mean we could do more in RAM at once.  
# Left as an exercise.  (check out `scipy.sparse.coo_matrix`)  
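
# A minimal sketch of that exercise, assuming the `bows` series and the `vocab` dictionary from the cells above; rather than building a `coo_matrix` by hand, `gensim.matutils.corpus2csc` packs the BOW vectors into a single sparse term-document matrix:

from gensim.matutils import corpus2csc

sparse_tdm = corpus2csc(bows, num_terms=len(vocab))  # scipy.sparse CSC, shape (terms, docs)
sparse_tdm.shape, sparse_tdm.nnz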

# In[18]: