Example #1
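# Shared context assumed by all four snippets (imports are not shown on this
# page): sys, math, and pickle from the standard library, numpy as np,
# gensim's word2vec module (something like: from gensim.models import
# word2vec), the project-local helper modules ps and ct, and the constants
# THRESHOLD, VEC_SIZE, SIMILARITY, and VERSION. The helpers load_stopwords()
# and checkAnswer() are defined elsewhere in the same project.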
def main(threshold=THRESHOLD):
    # load_data
    title, check, docs, tags = load_file()  # this variant's load_file also returns the tag list
    title_docs = title.split('\n')[:-1]
    ## parse titles to documents
    for i in range(len(title_docs)):
        title_docs[i] = ps.removeUselessContent(title_docs[i])
    stopwords = load_stopwords()

    # count tf & idf of corpus
    tmpTitle = ps.removeUselessContent(title)
    # Terms, Model = ps.generalModel(docs + tmpTitle, title_docs)  # rebuild the corpus model
    Terms, Model = pickle.load(open("model/terms_ver_NB.pkl", "rb"))  # or load the cached one

    # count tf of documents
    title_models = []
    for i in range(len(title_docs)):
        term, model = ps.generalModel(title_docs[i])
        title_models.append({
            'term': term,
            'tf': model['tf'],
            'length': len(title_docs[i])
        })
        title_models[i]['tags'] = []
        for j in range(len(tags)):
            if tags[j] in title_docs[i]:
                title_models[i]['tags'].append(j)
    print(title_models[0]['tf']['a'], title_models[0]['length'],
          title_models[0]['tags'])

    pickle.dump((Terms, Model), open("model/terms_ver_NB.pkl", "wb"), True)
    pickle.dump(title_models, open("model/title_models_ver_tag.pkl", "wb"),
                True)

    # test documents pairs
    out = open(sys.argv[2], 'w')
    out.write('ID,Ans\n')
    Yes = No = 0
    for i in range(len(check)):
        doc1 = check[i][0]
        doc2 = check[i][1]
        prob = -float("inf")
        for tag in title_models[doc1]['tags']:
            if tag in title_models[doc2]['tags']:
                print "index", i, "Tag found!"
                prob = 1
        if prob >= threshold:
            Yes += 1
            out.write(str(i) + ',' + str(1) + '\n')
        else:
            No += 1
            out.write(str(i) + ',' + str(0) + '\n')
    print("Yes:%d, No:%d" % (Yes, No))
Example #2
def load_file():
    # process data
    if sys.argv[1][-1] != '/':
        sys.argv[1] += '/'
    title = open(sys.argv[1] + 'title_StackOverflow.txt', 'r').read()
    check = open(sys.argv[1] + 'check_index.csv', 'r').read().split('\n')[1:-1]
    docs = open(sys.argv[1] + 'docs.txt', 'r').read()

    for i in range(len(check)):
        check[i] = [int(x) for x in check[i].split(',')[1:]]
    # print check[0], check[-1]

    docs = ps.removeUselessContent(docs)
    # print docs[:40], docs[-40:]

    return title, check, docs
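
Both main variants call load_stopwords(), which none of the snippets define.
A minimal sketch, assuming the stopwords live in a plain-text file with one
word per line (the model/stopwords.txt path is a guess, not taken from the
original code):

def load_stopwords():
    with open('model/stopwords.txt', 'r') as f:
        return set(line.strip() for line in f if line.strip())

Returning a set keeps the frequent "w not in stopwords" membership tests O(1).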
Example #3
def main(threshold=THRESHOLD):
    ## load_data ##
    title, check, docs = load_file()
    stopwords = load_stopwords()

    ## parse data ##
    # parse titles to documents
    title_docs = title.split('\n')[:-1]
    for i in range(len(title_docs)):
        title_docs[i] = ps.removeUselessContent(title_docs[i])
        title_docs[i] = [w for w in title_docs[i] if w not in stopwords]
    tmpTitle = ps.removeUselessContent(title)
    tmpout = open('model/words.txt', 'w')
    count = 0
    for word in docs + tmpTitle:
        if word in stopwords:
            continue
        tmpout.write(word + ' ')
        count += 1
        if count % 100 == 0:
            tmpout.write('\n')
    tmpout.close()  # flush the corpus file before word2vec reads it back in

    ## training ##
    # build word vector
    corpus = word2vec.Text8Corpus("model/words.txt")
    model = word2vec.Word2Vec(corpus, size=VEC_SIZE)
    model.save_word2vec_format(u"model/title_vector.txt", binary=False)
    model.save_word2vec_format(u"model/title_vector.bin", binary=True)

    ## load model ##
    # wordVec = word2vec.Word2Vec.load_word2vec_format('model/title_vector.txt', binary=False)
    wordVec = word2vec.Word2Vec.load_word2vec_format('model/title_vector.bin',
                                                     binary=True)
    # vector_file = open('model/title_vector.txt', 'r').read().split("\n")[1:-1]
    # vector_file = [x.split() for x in vector_file]
    # wordVec = {}
    # for i in range(len(vector_file)):
    #     vector_file[i][1:] = [float(x) for x in vector_file[i][1:]]
    #     wordVec[vector_file[i][0]] = np.array(vector_file[i][1:])
    # # print wordVec[vector_file[0][0]], wordVec[vector_file[-1][0]]
    titleVec = []
    for i in range(len(title_docs)):
        tmpVec = np.zeros((VEC_SIZE))
        for term in title_docs[i]:
            if term not in wordVec or term in stopwords:
                continue
            tmpVec += wordVec[term]
        titleVec.append(tmpVec)
    # print 'titleVec[0]', titleVec[0], 'titleVec[-1]', titleVec[-1]

    ## test documents pairs ##
    out = open(sys.argv[2], 'w')
    score = open('score', 'w')
    out.write('ID,Ans\n')
    Min = float("inf")
    Max = -float("inf")
    MinPos = MaxPos = 0
    Yes = No = 0
    Sum = 0.
    if SIMILARITY == 'Cosine':
        print('=== Using Cosine Similarity, threshold: %f ===' % (threshold))
    elif SIMILARITY == 'WMD':
        print("=== Using Word Mover's Distance, threshold: %f ===" %
              (threshold))
    for i in range(len(check)):
        doc1 = check[i][0]
        doc2 = check[i][1]
        if SIMILARITY == 'Cosine':
            # cosine similarity
            answer = ct.vecCosineSimilarity(titleVec[doc1], titleVec[doc2])
        elif SIMILARITY == 'WMD':
            # Word Mover's Distance
            answer = wordVec.wmdistance(title_docs[doc1], title_docs[doc2])
        if not math.isnan(answer) and answer != float("inf"):
            Sum += answer
        if not math.isnan(answer):
            score.write(str(answer) + '\n')
        else:
            score.write(str(float("inf")) + '\n')
        if i % 50000 == 0:
            print('producing index', i, 'answer:', answer)
        if answer < Min:
            Min = answer
            MinPos = i
        if answer > Max:
            Max = answer
            MaxPos = i
        if checkAnswer(answer):
            Yes += 1
            out.write(str(i) + ',' + str(1) + '\n')
        else:
            No += 1
            out.write(str(i) + ',' + str(0) + '\n')
    out.close()
    score.close()
    print("MinPos:%d, Min:%f" % (MinPos, Min))
    print("MaxPos:%d, Max:%f" % (MaxPos, Max))
    print("Yes:%d, No:%d" % (Yes, No))
    print("Sum:%f, Mean:%f" % (Sum, Sum / len(check)))
Example #4
def main(threshold=THRESHOLD):
    ## load_data ##
    title, check, docs = load_file()
    stopwords = load_stopwords()

    ## parse data ##
    title_docs = title.split('\n')[:-1]
    # parse titles to documents
    for i in range(len(title_docs)):
        title_docs[i] = ps.removeUselessContent(title_docs[i])

    if VERSION == 'sklearn':
        for i in range(len(title_docs)):
            title_docs[i] = ps.removeStopwords(title_docs[i], stopwords)
            title_docs[i] = " ".join(title_docs[i])
    else:
        # count tf & idf of corpus
        tmpTitle = ps.removeUselessContent(title)
        # Terms, Model = ps.generalModel(docs+tmpTitle, title_docs)  # rebuild the corpus model
        Terms, Model = pickle.load(open("model/terms_ver_cosine.pkl", "rb"))  # or load the cached one
        pickle.dump((Terms, Model), open("model/terms_ver_cosine.pkl", "wb"),
                    True)

        # count tf of documents
        title_models = []
        for i in range(len(title_docs)):
            terms, model = ps.generalModel(title_docs[i])
            terms = ps.removeStopwords(terms, stopwords)
            model = ps.parseTFIDF(terms, model, Model)
            title_models.append({'terms': terms, 'tfidf': model['tfidf']})
        # print title_models[0]['tf']['a'],title_models[0]['length']
        pickle.dump(title_models,
                    open("model/title_models_ver_cosine.pkl", "wb"), True)

    # test documents pairs
    out = open(sys.argv[2], 'w')
    out.write('ID,Ans\n')
    Min = float("inf")
    Max = -float("inf")
    MinPos = MaxPos = 0
    Yes = No = 0
    Sum = 0
    if VERSION == 'sklearn':
        print "generating cosine matrix......"
        cosineMatrix = ct.cosineMatrix(title_docs)
        for i in range(len(check)):
            doc1 = check[i][0]
            doc2 = check[i][1]
            cosineSimilarity = cosineMatrix[doc1][doc2]
            if i % 50000 == 0:
                print('producing index', i, 'cosineSimilarity:', cosineSimilarity)
            if cosineSimilarity < Min:
                Min = cosineSimilarity
                MinPos = i
            if cosineSimilarity > Max:
                Max = cosineSimilarity
                MaxPos = i
            Sum += cosineSimilarity
            if cosineSimilarity >= threshold:
                Yes += 1
                out.write(str(i) + ',' + str(1) + '\n')
            else:
                No += 1
                out.write(str(i) + ',' + str(0) + '\n')
    else:
        for i in range(len(check)):
            doc1 = check[i][0]
            doc2 = check[i][1]
            cosineSimilarity = ct.docCosineSimilarity(title_models[doc1],
                                                      title_models[doc2],
                                                      title_docs[doc1],
                                                      title_docs[doc2])
            if i % 50000 == 0:
                print('producing index', i, 'cosineSimilarity:', cosineSimilarity)
            if cosineSimilarity < Min:
                Min = cosineSimilarity
                MinPos = i
            if cosineSimilarity > Max:
                Max = cosineSimilarity
                MaxPos = i
            Sum += cosineSimilarity
            if cosineSimilarity >= threshold:
                Yes += 1
                out.write(str(i) + ',' + str(1) + '\n')
            else:
                No += 1
                out.write(str(i) + ',' + str(0) + '\n')
    print("MinPos:%d, Min:%f" % (MinPos, Min))
    print("MaxPos:%d, Max:%f" % (MaxPos, Max))
    print("Yes:%d, No:%d" % (Yes, No))
    print("Sum:%f, Mean:%f" % (Sum, Sum / len(check)))