def main(threshold=THRESHOLD):
    """Answer duplicate-title pairs by shared tags.

    A pair from `check` is predicted as duplicate (answer 1) when the two
    titles share at least one tag index, i.e. its score (1 on a shared tag,
    -inf otherwise) reaches `threshold`. Answers are written as an
    "ID,Ans" CSV to the path in sys.argv[2]; the corpus and per-title
    models are (re)pickled under model/.

    :param threshold: minimum score required to answer 1.
    """
    # load data
    title, check, docs, tags = load_file()
    title_docs = title.split('\n')[:-1]

    # parse titles to documents
    for i in range(len(title_docs)):
        title_docs[i] = ps.removeUselessContent(title_docs[i])
    stopwords = load_stopwords()

    # corpus tf & idf: loaded from a precomputed pickle; regenerate with
    # ps.generalModel(docs + tmpTitle, title_docs) when the corpus changes
    tmpTitle = ps.removeUselessContent(title)
    with open("model/terms_ver_NB.pkl", "rb") as f:
        Terms, Model = pickle.load(f)

    # per-title tf model, plus the indices of the tags appearing in it
    title_models = []
    for i in range(len(title_docs)):
        term, model = ps.generalModel(title_docs[i])
        title_models.append({
            'term': term,
            'tf': model['tf'],
            'length': len(title_docs[i])
        })
        title_models[i]['tags'] = []
        for j in range(len(tags)):
            if tags[j] in title_docs[i]:
                title_models[i]['tags'].append(j)

    with open("model/terms_ver_NB.pkl", "wb") as f:
        pickle.dump((Terms, Model), f, True)
    with open("model/title_models_ver_tag.pkl", "wb") as f:
        pickle.dump(title_models, f, True)

    # test document pairs
    Yes = No = 0
    with open(sys.argv[2], 'w') as out:
        out.write('ID,Ans\n')
        for i in range(len(check)):
            doc1 = check[i][0]
            doc2 = check[i][1]
            prob = -float("inf")
            for tag in title_models[doc1]['tags']:
                if tag in title_models[doc2]['tags']:
                    prob = 1
                    break  # one shared tag is enough; no need to keep scanning
            if prob >= threshold:
                Yes += 1
                out.write(str(i) + ',' + str(1) + '\n')
            else:
                No += 1
                out.write(str(i) + ',' + str(0) + '\n')
    print("Yes:%d, No:%d" % (Yes, No))
def load_file():
    """Read the dataset from the directory named by sys.argv[1].

    Returns a (title, check, docs) tuple:
        title: raw contents of title_StackOverflow.txt
        check: list of [doc1, doc2] integer index pairs from check_index.csv
        docs:  contents of docs.txt after ps.removeUselessContent
    """
    # normalize the data directory path so it ends with '/'
    if sys.argv[1][-1] != '/':
        sys.argv[1] += '/'
    with open(sys.argv[1] + 'title_StackOverflow.txt', 'r') as f:
        title = f.read()
    with open(sys.argv[1] + 'check_index.csv', 'r') as f:
        # drop the CSV header row and the trailing empty line
        check = f.read().split('\n')[1:-1]
    with open(sys.argv[1] + 'docs.txt', 'r') as f:
        docs = f.read()
    # each row is "id,doc1,doc2" -> keep only the pair of document indices
    for i in range(len(check)):
        check[i] = [int(x) for x in check[i].split(',')[1:]]
    docs = ps.removeUselessContent(docs)
    return title, check, docs
def main(threshold=THRESHOLD):
    """Predict duplicate-title pairs with word2vec sentence vectors.

    Trains word2vec on the stopword-filtered corpus, represents each title
    as the sum of its in-vocabulary word vectors, then scores every pair in
    `check` with either cosine similarity or Word Mover's Distance
    (selected by the module-level SIMILARITY). Writes 0/1 answers as an
    "ID,Ans" CSV to sys.argv[2] and the raw scores to the file 'score'.

    :param threshold: similarity cutoff (consumed by checkAnswer).
    """
    ## load data ##
    title, check, docs = load_file()
    stopwords = load_stopwords()

    ## parse data ##
    # parse titles to documents, dropping stopwords
    title_docs = title.split('\n')[:-1]
    for i in range(len(title_docs)):
        title_docs[i] = ps.removeUselessContent(title_docs[i])
        title_docs[i] = [w for w in title_docs[i] if w not in stopwords]
    tmpTitle = ps.removeUselessContent(title)

    # dump the filtered corpus to model/words.txt for word2vec; the `with`
    # block closes (and flushes) it BEFORE Text8Corpus reads it back,
    # otherwise buffered words could be missing from the training data
    count = 0
    with open('model/words.txt', 'w') as tmpout:
        for word in docs + tmpTitle:
            if word in stopwords:
                continue
            tmpout.write(word + ' ')
            count += 1
            if count % 100 == 0:
                tmpout.write('\n')  # keep lines short for Text8Corpus

    ## training ##
    corpus = word2vec.Text8Corpus("model/words.txt")
    model = word2vec.Word2Vec(corpus, size=VEC_SIZE)
    model.save_word2vec_format(u"model/title_vector.txt", binary=False)
    model.save_word2vec_format(u"model/title_vector.bin", binary=True)

    ## load model ##
    wordVec = word2vec.Word2Vec.load_word2vec_format('model/title_vector.bin',
                                                     binary=True)

    # sentence vector = sum of the word vectors of its in-vocabulary words
    titleVec = []
    for i in range(len(title_docs)):
        tmpVec = np.zeros((VEC_SIZE))
        for term in title_docs[i]:
            if term not in wordVec or term in stopwords:
                continue
            tmpVec += wordVec[term]
        titleVec.append(tmpVec)

    ## test document pairs ##
    Min = float("inf")
    Max = -float("inf")
    MinPos = MaxPos = 0
    Yes = No = 0
    Sum = 0.
    if SIMILARITY == 'Cosine':
        print('=== Using Cosine Similarity, threshold: %f ===' % (threshold))
    elif SIMILARITY == 'WMD':
        print("=== Using Word Mover's Distance, threshold: %f ===" % (threshold))
    with open(sys.argv[2], 'w') as out, open('score', 'w') as score:
        out.write('ID,Ans\n')
        for i in range(len(check)):
            doc1 = check[i][0]
            doc2 = check[i][1]
            if SIMILARITY == 'Cosine':
                # cosine similarity between summed sentence vectors
                answer = ct.vecCosineSimilarity(titleVec[doc1], titleVec[doc2])
            elif SIMILARITY == 'WMD':
                # Word Mover's Distance over the raw token lists
                answer = wordVec.wmdistance(title_docs[doc1], title_docs[doc2])
            # WMD yields nan/inf for empty documents; keep them out of the mean
            if math.isnan(answer) == False and answer != float("inf"):
                Sum += answer
            if math.isnan(answer) == False:
                score.write(str(answer) + '\n')
            else:
                score.write(str(float("inf")) + '\n')
            if i % 50000 == 0:
                print('producing index %d answer: %s' % (i, answer))
            if answer < Min:
                Min = answer
                MinPos = i
            if answer > Max:
                Max = answer
                MaxPos = i
            if checkAnswer(answer) == True:
                Yes += 1
                out.write(str(i) + ',' + str(1) + '\n')
            else:
                No += 1
                out.write(str(i) + ',' + str(0) + '\n')
    print("MinPos:%d, Min:%f" % (MinPos, Min))
    print("MaxPos:%d, Max:%f" % (MaxPos, Max))
    print("Yes:%d, No:%d" % (Yes, No))
    print("Sum:%f, Mean:%f" % (Sum, Sum / len(check)))
def main(threshold=THRESHOLD):
    """Predict duplicate-title pairs via TF-IDF cosine similarity.

    Two code paths selected by the module-level VERSION: 'sklearn' builds a
    full pairwise cosine matrix with ct.cosineMatrix; any other value scores
    each pair on demand with ct.docCosineSimilarity over pickled TF-IDF
    models. A pair is answered 1 when its similarity >= `threshold`; answers
    go to the "ID,Ans" CSV named by sys.argv[2].

    :param threshold: cosine-similarity cutoff for answering 1.
    """
    ## load data ##
    title, check, docs = load_file()
    stopwords = load_stopwords()

    ## parse data ##
    # parse titles to documents
    title_docs = title.split('\n')[:-1]
    for i in range(len(title_docs)):
        title_docs[i] = ps.removeUselessContent(title_docs[i])

    if VERSION == 'sklearn':
        # sklearn path expects whitespace-joined strings, not token lists
        for i in range(len(title_docs)):
            title_docs[i] = ps.removeStopwords(title_docs[i], stopwords)
            title_docs[i] = " ".join(title_docs[i])
    else:
        # corpus tf & idf: loaded from a precomputed pickle; regenerate with
        # ps.generalModel(docs + tmpTitle, title_docs) when the corpus changes
        tmpTitle = ps.removeUselessContent(title)
        with open("model/terms_ver_cosine.pkl", "rb") as f:
            Terms, Model = pickle.load(f)
        with open("model/terms_ver_cosine.pkl", "wb") as f:
            pickle.dump((Terms, Model), f, True)
        # per-title tf-idf models
        title_models = []
        for i in range(len(title_docs)):
            terms, model = ps.generalModel(title_docs[i])
            terms = ps.removeStopwords(terms, stopwords)
            model = ps.parseTFIDF(terms, model, Model)
            title_models.append({'terms': terms, 'tfidf': model['tfidf']})
        with open("model/title_models_ver_cosine.pkl", "wb") as f:
            pickle.dump(title_models, f, True)

    ## test document pairs ##
    Min = float("inf")
    Max = -float("inf")
    MinPos = MaxPos = 0
    Yes = No = 0
    Sum = 0
    if VERSION == 'sklearn':
        print("generating cosine matrix......")
        cosineMatrix = ct.cosineMatrix(title_docs)
    with open(sys.argv[2], 'w') as out:
        out.write('ID,Ans\n')
        # single accounting loop for both versions; only the scorer differs
        for i in range(len(check)):
            doc1 = check[i][0]
            doc2 = check[i][1]
            if VERSION == 'sklearn':
                cosineSimilarity = cosineMatrix[doc1][doc2]
            else:
                cosineSimilarity = ct.docCosineSimilarity(
                    title_models[doc1], title_models[doc2],
                    title_docs[doc1], title_docs[doc2])
            if i % 50000 == 0:
                print('producing index %d cosineSimilarity: %s'
                      % (i, cosineSimilarity))
            if cosineSimilarity < Min:
                Min = cosineSimilarity
                MinPos = i
            if cosineSimilarity > Max:
                Max = cosineSimilarity
                MaxPos = i
            Sum += cosineSimilarity
            if cosineSimilarity >= threshold:
                Yes += 1
                out.write(str(i) + ',' + str(1) + '\n')
            else:
                No += 1
                out.write(str(i) + ',' + str(0) + '\n')
    print("MinPos:%d, Min:%f" % (MinPos, Min))
    print("MaxPos:%d, Max:%f" % (MaxPos, Max))
    print("Yes:%d, No:%d" % (Yes, No))
    print("Sum:%f, Mean:%f" % (Sum, Sum / len(check)))