def tfidf_calculation_multistage(path_list_1, path_list_2):
    """Run TF-IDF over two folder lists with independent TFIDF instances
    and assert both runs produce identical feature vectors.

    Note: as in the original, only the result of the *last* folder in
    each list is compared.
    """
    stage_one = tfidf.TFIDF()
    a = None
    for folder in path_list_1:
        a = stage_one.get_tf_idf(folder)

    stage_two = tfidf.TFIDF()
    b = None
    for folder in path_list_2:
        b = stage_two.get_tf_idf(folder)

    # Use `entry`, not `t`, so the comprehension does not shadow the
    # TFIDF instance variable used above.
    a_features = [entry.tfidf for entry in a]
    b_features = [entry.tfidf for entry in b]
    print(a_features)
    assert (a_features == b_features)
def tfidf_obj_creation_perf(dataset_path):
    """Exercise TFIDF creation/computation on one dataset path and dump
    the stringified result to /tmp/result.txt for inspection."""
    t = tfidf.TFIDF()
    ans = None
    # `folder` instead of `dir` avoids shadowing the builtin dir().
    for folder in [dataset_path]:
        ans = t.get_tf_idf(folder)
    with open("/tmp/result.txt", 'w') as f:
        f.write(str(ans))
def encode_tfidf_model(document_type, word_thresh=1):
    """Load a cached TF-IDF model, building and caching it if absent.

    Parameters:
        document_type: forwarded to the story loader and used in the
            cache filename template.
        word_thresh: minimum word-count threshold for the vocabulary.

    Returns:
        A TFIDF model object (unpickled from cache, or freshly built).
    """
    cache_path = utils.TFIDF_TEMPLATE % (document_type, word_thresh)
    check_save_directory(filename=cache_path)

    if os.path.exists(cache_path):
        # Cache hit. NOTE: pickle.load is only safe because we wrote
        # this file ourselves below.
        with open(cache_path, 'rb') as fid:
            return pickle.load(fid)

    # Cache miss: read the stories and collect normalized words
    # per movie, in deterministic (sorted) movie order.
    story, _ = mqa.get_story_qa_data('full', document_type)
    movie_keys = sorted(story.keys())
    words_per_movie = []
    for key in movie_keys:
        tokens = []
        for sentence in story[key]:
            normalized = utils.normalize_stemming(
                utils.normalize_alphanumeric(sentence.lower()))
            tokens.extend(normalized.split(' '))
        words_per_movie.append(tokens)

    # Build the TF-IDF model over the collected word lists.
    model = tfidfcalc.TFIDF(movie_keys)
    model.get_filtered_vocabulary(words_per_movie, word_thresh=word_thresh)
    model.compute_tfidf(words_per_movie)

    # Persist so future calls take the fast path above.
    with open(cache_path, 'wb') as fid:
        pickle.dump(model, fid)

    return model
# Ejemplo n.º 4
# 0
def calcTfIdf(totalList, chidDict, keyword="digest"):
    # Build a TF-IDF matrix (token -> {class id -> score}) from the full
    # document list and per-class documents, and save it to a TSV file
    # named "<keyword>_tfidf.txt".
    # NOTE(review): this is Python 2 code (print statements, iteritems).
    # Build the tf-idf model
    print "tf-idf calculating..."
    myTfIdf = tfidf.TFIDF(totalList)
    # Compute IDF values over all documents
    idfDict = myTfIdf.getIdf()
    # Show words appearing in more than ~1% of all articles (idf < 2)
    # along with their IDF values
    print[(x.encode('utf-8'), idfDict[x]) for x in idfDict if idfDict[x] < 2]
    # Compute per-class TF values, then TF-IDF
    tfidfMatrix = {}
    for index, value in chidDict.iteritems():
        tfDict = myTfIdf.getTf(value)
        # Combine TF and IDF; store scores in tfidfMatrix
        for one in myTfIdf.getTfIdf(tfDict, idfDict):
            oneClass = tfidfMatrix.setdefault(one[0], dict())
            oneClass[index] = one[1]
    # save tfidfMatrix: header row of class ids, then one row per token
    # with its score in each class (0.0 when absent)
    with open("%s_tfidf.txt" % keyword, 'w+') as outfile:
        classNames = list(chidDict.keys())
        print >> outfile, "\t%s" % ('\t'.join([str(x) for x in classNames]))
        for token, classDict in tfidfMatrix.iteritems():
            List = [token.encode("utf-8")]
            for one in classNames:
                List.append(str(classDict.get(one, 0.0)))
            print >> outfile, '\t'.join(List)
# Ejemplo n.º 5
# 0
def RankedSearch(index, querypath, n):
    """Run ranked TF-IDF retrieval for each query in *querypath*.

    Each line of the query file is tokenized, stop-word-filtered and
    stemmed, then scored against *index*. The top 10 hits per query are
    printed; up to 1000 hits per query are written to results.ranked.txt.

    Returns:
        resultmatrix: one result list (as produced by tfidf.TFIDF) per query.
    """

    def _fmt(line):
        # trec_eval-style output row; fields 2 and 4 are fixed zeros.
        return (str(line[0]) + " 0 " + str(line[2]) + " 0 " +
                str(line[4]) + " 0")

    # --- query processing: tokenize, remove stopwords, stem ---
    queries = []
    with open(querypath, 'r') as f:
        for line in f:
            raw = line.strip("\n").strip("\r")
            # (the original called raw.split(" ") and discarded the
            # result — that dead statement has been removed)
            queries.append(
                ts.stem_tokens(ts.removestops(ts.tokenizeText(raw))))

    # --- TF-IDF scoring, with a top-10 preview per query ---
    resultmatrix = []
    for query in queries:
        result = tfidf.TFIDF(query, index, n)
        print(str(query))
        print(str(len(result)) + " documents found. Top 10:")
        for line in result[:10]:
            print(_fmt(line))
        resultmatrix.append(result)
        print("\n")

    # --- write up to 1000 results per query ---
    with open("results.ranked.txt", 'w') as f:
        for i, results in enumerate(resultmatrix):
            f.write("Query: " + str(queries[i]) + "\n")
            for line in results[:1000]:
                f.write(_fmt(line) + " \n")
            f.write("\n")

    return resultmatrix
# Ejemplo n.º 6
# 0
    def __init__(self, ftp_folder, data_dir, tfidf_storage_dir, interval):
        """Initialize the TF-IDF worker and start its background thread.

        Args:
            ftp_folder: remote folder to pull data from (used by the
                generator thread — semantics defined elsewhere).
            data_dir: local directory for downloaded data.
            tfidf_storage_dir: directory where TF-IDF output is stored.
            interval: polling/processing interval for the worker.
        """

        # Cross-process queue for handing work to/from the generator.
        self.queue = multiprocessing.SimpleQueue()
        self.ftp_folder = ftp_folder
        self.data_dir = data_dir
        self.tfidf_storage_dir = tfidf_storage_dir
        self.interval = interval

        #if not os.path.exists(self.data_dir ):
        #    os.makedirs(self.data_dir)

        #if not os.path.exists(self.tfidf_storage_dir ):
        #    os.makedirs(self.tfidf_storage_dir)

        self.tfidf_obj = tfidf.TFIDF()
        # Resolve this host's IP from its hostname.
        self.hostname = socket.gethostbyname(socket.gethostname())
        # NOTE(review): `self.daemon` here is a plain attribute holding a
        # Thread object — it is NOT the Thread.daemon flag, so this
        # thread is non-daemonic; `do_exit` is the shutdown signal that
        # tfidf_generater (defined elsewhere) is expected to poll.
        self.daemon = threading.Thread(target=self.tfidf_generater, args=())
        self.do_exit = False
        # Starting a thread inside __init__ means work begins before the
        # caller sees the constructed object.
        self.daemon.start()
def basic_tfidf_obj(path):
    """Compute TF-IDF for *path* and print the first result's tfidf value."""
    model = tfidf.TFIDF()
    results = model.get_tf_idf(path)
    first = results[0]
    print(first.tfidf)