import glob
import os

import cossim
import tfidf


def find_similar_patents(patent, citations, k):
    citation_doc_name = []
    citation_document = []
    patent_found = False
    # Path to patent text
    path_name = "D:/ASU_Courses/SWM/project/patent_text/"
    for file in glob.glob(os.path.normpath(path_name + "*.txt")):
        file_name = os.path.basename(file).split('.')[0]
        if file_name == patent:
            patent_found = True
        if file_name in citations:
            citation_doc_name.append(file_name)
            text = open(file, encoding='utf-8').read()
            citation_document.append(text)
    if not patent_found:
        print("Patent Text not available")
        return [], []
    else:
        # Put the query patent first so its TF-IDF row is at index 0.
        patent_text = open(path_name + patent + ".txt", encoding='utf-8').read()
        citation_document.insert(0, patent_text)
        citation_doc_name.insert(0, patent)
        tfidf_vec = tfidf.compute_tfidf(citation_document)
        #print(tfidf_vec.shape)
        similarity_indices, similarity_values = cossim.compute(
            tfidf_vec[0], tfidf_vec, k)
        patent_names_similar = [
            citation_doc_name[index] for index in similarity_indices
        ]
        return patent_names_similar, similarity_values
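# Hypothetical usage sketch (the patent IDs and citation list below are made up;
# the local `tfidf` and `cossim` helper modules are assumed from the code above):
#
#   names, scores = find_similar_patents("US1234567",
#                                         citations=["US2345678", "US3456789"],
#                                         k=5)
#   for name, score in zip(names, scores):
#       print(name, score)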
def make_data(X_pure_train, X_sentences_train, aspects_list_train,
              X_pure_test, X_sentences_test):
    sent_word = sentiRuLex()
    w2v = W2V()
    #w2v = None
    list_of_tfidf_train = compute_tfidf(X_sentences_train)
    list_of_tfidf_test = compute_tfidf(X_sentences_test)
    y_train = make_y_train(X_pure_train, aspects_list_train)
    # Domain keywords: 'ресторан' = "restaurant", 'автомобиль' = "car".
    x_train = make_x(X_pure_train, list_of_tfidf_train, sent_word, w2v,
                     'ресторан')
    x_test = make_x(X_pure_test, list_of_tfidf_test, sent_word, w2v,
                    'автомобиль')
    return x_train, y_train, x_test
import os
from pprint import pprint

import embedding
import tfidf


def main():
    # initialize
    os.chdir(os.path.dirname(os.path.realpath(__file__)))
    news_path, embeddings_path, amount, query_terms = parse_args()

    # Sample and preprocess documents, then weight their terms with TF-IDF.
    rand_docs = get_random_docs(news_path, amount)
    preprocessed = [preprocess(doc) for doc in rand_docs]
    idf = tfidf.compute_idf(preprocessed)
    tfidfs_ = [tfidf.compute_tfidf(idf, doc) for doc in preprocessed]

    # Turn each document's TF-IDF weights into an embedding vector.
    emb = embedding.load_emebeddings(embeddings_path)
    embedding_vecs = [
        embedding.compute_embedding_vec(emb, tfidf_) for tfidf_ in tfidfs_
    ]

    # Embed the query the same way and rank the documents against it.
    query_tfidf = tfidf.compute_tfidf(idf, query_terms)
    query_embedded = embedding.compute_embedding_vec(emb, query_tfidf)
    ranked = tfidf.rank(query_embedded, embedding_vecs)
    pprint(ranked[:10])
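# A minimal sketch of what the `compute_idf` / `compute_tfidf` helpers called
# above might look like; the dict-based return types are assumptions inferred
# from the call sites, not the project's actual `tfidf` module.
import math
from collections import Counter


def compute_idf(docs):
    """docs: list of token lists -> {term: idf}."""
    n = len(docs)
    df = Counter(term for doc in docs for term in set(doc))
    return {term: math.log(n / count) for term, count in df.items()}


def compute_tfidf(idf, doc):
    """doc: list of tokens -> {term: tf * idf}."""
    counts = Counter(doc)
    total = len(doc) or 1
    return {term: (count / total) * idf.get(term, 0.0)
            for term, count in counts.items()}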
import numpy as np

import tfidf


def lexrank(sentences, N, threshold, vectorizer, model):
    """
    Summarize a document with LexRank.
    @param sentences: list
        Sentences, e.g. [u'こんにちは.', u'私の名前は佐藤です.', ...]
    @param N: int
        Number of sentences in the document
    @param threshold: float
        Similarity threshold used when building the adjacency matrix (similarity graph)
    @param vectorizer: string
        Sentence vectorization method ("tf-idf" / "word2vec")
    @param model:
        word2vec model (used when vectorizer == "word2vec")
    @return: L
        LexRank scores
    """
    CosineMatrix = np.zeros([N, N])
    degree = np.zeros(N)
    L = np.zeros(N)

    if vectorizer == "tf-idf":
        vector = tfidf.compute_tfidf(sentences)
    elif vectorizer == "word2vec":
        vector = tfidf.compute_word2vec(sentences, model)

    # Computing Adjacency Matrix
    for i in range(N):
        for j in range(N):
            CosineMatrix[i, j] = tfidf.compute_cosine(vector[i], vector[j])
            if CosineMatrix[i, j] > threshold:
                CosineMatrix[i, j] = 1
                degree[i] += 1
            else:
                CosineMatrix[i, j] = 0

    # Computing LexRank Score
    for i in range(N):
        for j in range(N):
            CosineMatrix[i, j] = CosineMatrix[i, j] * 1.0 / (
                degree[i] if degree[i] != 0 else 1.0)

    L = PowerMethod(CosineMatrix, N, err_tol=10e-6)

    return L
def calc_lexrank(sentences, N, threshold, vectorizer):
    """
    Summarize a document with LexRank.
    @param sentences: list
        Sentences, given as a list of tokenized sentences like
        [[w1, w2, w3], [w1, w3, w4, w5], ...]
    @param N: int
        Number of sentences in the document
    @param threshold: float
        Cosine similarity threshold (default 0.1)
    @param vectorizer: string
        Sentence vectorization method ("tf-idf" / "word2vec")
    @return: list
        LexRank scores
    """
    CosineMatrix = np.zeros([N, N])
    degree = np.zeros(N)
    L = np.zeros(N)

    if vectorizer == "tf-idf":
        vector = tfidf.compute_tfidf(sentences)
    elif vectorizer == "word2vec":
        vector = tfidf.compute_word2vec(sentences)

    # 1. Build the adjacency matrix
    for i in range(N):
        for j in range(N):
            CosineMatrix[i, j] = tfidf.compute_cosine(vector[i], vector[j])
            if CosineMatrix[i, j] > threshold:
                CosineMatrix[i, j] = 1
                degree[i] += 1
            else:
                CosineMatrix[i, j] = 0

    # 2. Compute LexRank
    for i in range(N):
        for j in range(N):
            CosineMatrix[i, j] = CosineMatrix[i, j] / degree[i]

    L = PowerMethod(CosineMatrix, N, err_tol=10e-6)

    return L
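# Both LexRank variants above call a PowerMethod helper that is not shown here.
# A minimal power-iteration sketch with the same call signature (the uniform
# starting vector and convergence test are assumptions about the original):
import numpy as np


def PowerMethod(CosineMatrix, N, err_tol):
    """Iterate p <- M^T p until the change falls below err_tol."""
    p = np.ones(N) / N          # uniform initial score vector
    err = 1.0
    while err > err_tol:
        p_next = np.dot(CosineMatrix.T, p)
        err = np.linalg.norm(p_next - p)
        p = p_next
    return p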
import argparse
import sqlite3

import tensorflow as tf

import tfidf

# Helpers such as granularize, featurize, train_input_fn, test_input_fn, and the
# param_type / ParamType / discrete_possible_values / ConfusionMatrix names used
# below come from elsewhere in this project.


def main():
    parser = argparse.ArgumentParser(description='DNN')
    parser.add_argument('--features', '-f', help='features database')
    parser.add_argument(
        '--granularity', '-g',
        help='granularity level (1 = coarse, label with predicate only; 2 = middle, label with predicate and adjunct if available; 3 = fine, label with full parse)')
    parser.add_argument(
        '--kfactor', '-k',
        help='Exclude every kth entry (training), include every kth entry (testing)')
    parser.add_argument(
        '--offset', '-n',
        help='Exclude every kth entry (training), include every kth entry (testing), offset by n')
    parser.add_argument('--steps', '-s', help='# training steps')
    parser.add_argument('--combined', '-c', action='store_true',
                        help='Use combined classifier')
    parser.add_argument('--weighted', '-w', action='store_true',
                        help='Weight features')
    parser.add_argument('--discrete_weights_only', '-d', action='store_true',
                        help='Only weight discrete features')
    parser.add_argument(
        '--omit_features', '-o', action='store_true',
        help='Omit feature information (use weights only as features)')
    args = parser.parse_args()

    global features_db
    features_db = args.features
    global granularity
    granularity = int(args.granularity)
    global kfactor
    kfactor = int(args.kfactor)
    global offset
    offset = int(args.offset)
    global steps
    steps = int(args.steps)
    global combined
    combined = args.combined
    global weighted
    weighted = args.weighted
    global discrete_weights_only
    discrete_weights_only = args.discrete_weights_only
    global omit_features
    omit_features = args.omit_features

    slice = -1

    # Load video entries and their alternate-sentence candidates from SQLite.
    features_conn = sqlite3.connect(features_db)
    features_cur = features_conn.cursor()
    features_cur.execute('SELECT * FROM VideoDBEntry')
    results = features_cur.fetchall()[:slice]
    features_cur.execute('SELECT * FROM AlternateSentences')
    candidates = [list(r[2:]) for r in features_cur.fetchall()[:slice]]
    features_conn.close()

    for i in range(len(candidates)):
        for j in range(len(candidates[i])):
            candidates[i][j] = granularize(candidates[i][j], granularity)

    print candidates

    # Every kth entry (with offset) becomes dev/test data; the rest is training data.
    global dev_test
    dev_test = results[offset::kfactor]
    test_candidates = candidates[offset::kfactor]
    global train
    train = [r for r in results if r not in dev_test]
    train_canditates = [
        candidates[i] for i in range(len(candidates))
        if candidates[i] not in dev_test
    ]

    global label_set
    label_set = []
    true_labels = []

    dev_test = featurize(dev_test)
    train = featurize(train)

    for entry in dev_test:
        if granularize(entry["Input"], granularity) not in label_set:
            label_set.append(granularize(entry["Input"], granularity))
        true_labels.append(granularize(entry["Input"], granularity))

    for entry in train:
        if granularize(entry["Input"], granularity) not in label_set:
            label_set.append(granularize(entry["Input"], granularity))

    for candidate_set in candidates:
        for candidate in candidate_set:
            if candidate not in label_set:
                label_set.append(candidate)

    print label_set

    # Build the feature columns; discrete features can be IDF-weighted.
    wide_columns = []
    deep_columns = []
    feature_columns = []

    global tfidf_bias
    tfidf_bias = {}
    for key in sorted(param_type):
        tfidf_bias[key] = tfidf.compute_tfidf(features_db, key)
        if param_type[key] == ParamType.discrete:
            if weighted:
                sparse_feature = tf.contrib.layers.sparse_column_with_keys(
                    key, discrete_possible_values[key],
                    dtype=tf.int64 if type(discrete_possible_values[key][0]) is int
                    else tf.string)
                weighted_feature = tf.contrib.layers.weighted_sparse_column(
                    sparse_id_column=sparse_feature,
                    weight_column_name=key + "IDFWeight")
                feature_columns.append(
                    tf.contrib.layers.one_hot_column(sparse_feature))
                deep_columns.append(
                    tf.contrib.layers.one_hot_column(sparse_feature))
                feature_columns.append(
                    tf.contrib.layers.embedding_column(weighted_feature, dimension=1))
                deep_columns.append(
                    tf.contrib.layers.embedding_column(weighted_feature, dimension=1))
            else:
                feature_columns.append(
                    tf.contrib.layers.one_hot_column(
                        tf.contrib.layers.sparse_column_with_keys(
                            key, discrete_possible_values[key],
                            dtype=tf.int64 if type(discrete_possible_values[key][0]) is int
                            else tf.string)))
                deep_columns.append(
                    tf.contrib.layers.one_hot_column(
                        tf.contrib.layers.sparse_column_with_keys(
                            key, discrete_possible_values[key],
                            dtype=tf.int64 if type(discrete_possible_values[key][0]) is int
                            else tf.string)))
            # feature_columns.append(tf.contrib.layers.sparse_column_with_keys(key+"IDFWeight", [tfidf_bias[key]], dtype=tf.float32))
            # deep_columns.append(tf.contrib.layers.sparse_column_with_keys(key+"IDFWeight", [tfidf_bias[key]], dtype=tf.float32))
            # feature_columns.append(tf.contrib.layers.real_valued_column(key+"IDFWeight", dimension=1))
            # deep_columns.append(tf.contrib.layers.real_valued_column(key+"IDFWeight", dimension=1))
        elif param_type[key] == ParamType.continuous:
            feature_columns.append(
                tf.contrib.layers.real_valued_column(key, dimension=1))
            wide_columns.append(
                tf.contrib.layers.real_valued_column(key, dimension=1))
            # feature_columns.append(tf.contrib.layers.real_valued_column(key+"IDFWeight", dimension=1))
            # wide_columns.append(tf.contrib.layers.real_valued_column(key+"IDFWeight", dimension=1))

    # print train

    if combined:
        classifier = tf.contrib.learn.DNNLinearCombinedClassifier(
            linear_feature_columns=wide_columns,
            dnn_feature_columns=deep_columns,
            dnn_hidden_units=[10, 20, 20, 10],
            n_classes=len(label_set))
    else:
        classifier = tf.contrib.learn.DNNClassifier(
            feature_columns=feature_columns,
            hidden_units=[10, 20, 20, 10],
            n_classes=len(label_set))

    print tfidf_bias

    classifier.fit(input_fn=train_input_fn, steps=steps)
    # print classifier.evaluate(input_fn=test_input_fn, steps=1)

    testing_data = test_input_fn

    predicted_labels = []

    probs = classifier.predict_proba(input_fn=testing_data)
    predictions = classifier.predict(input_fn=testing_data)
    pred_probs = [[i for i in predictions], [j for j in probs]]

    reference = []
    restricted_test = []
    unrestricted_test = []

    for i in range(len(pred_probs[0])):
        # Restrict the prediction to the candidate sentences for this entry.
        candidate_indices = [
            label_set.index(test_candidates[i][j])
            for j in range(len(test_candidates[i]))
        ]
        restricted_best_prob = pred_probs[1][i][candidate_indices[0]]
        restricted_best_match = label_set[candidate_indices[0]] if max(
            pred_probs[1][i]) > 0.0 else "None"
        restricted_best_match_index = candidate_indices[0] if max(
            pred_probs[1][i]) > 0.0 else None
        for k in candidate_indices:
            if pred_probs[1][i][k] > restricted_best_prob:
                restricted_best_prob = pred_probs[1][i][k]
                restricted_best_match = label_set[k]
                restricted_best_match_index = k

        multiple_choice = []
        multiple_choice.append(
            candidate_indices.index(restricted_best_match_index) + 1)
        for k in candidate_indices:
            if pred_probs[1][i][k] == restricted_best_prob:
                if candidate_indices.index(k) + 1 not in multiple_choice:
                    multiple_choice.append(candidate_indices.index(k) + 1)

        print "\nCandidates in restricted choice set: %s (indices %s)" % (
            test_candidates[i], candidate_indices)
        multiple_choice = list(set(sorted(multiple_choice)))
        print "Prediction with multiple choice option: %s" % (
            sorted(multiple_choice))
        print "Prediction with restricted choice set: %s (index %s, probability %s)" % (
            restricted_best_match, restricted_best_match_index,
            restricted_best_prob)

        unrestricted_best_match = label_set[pred_probs[0][i]] if max(
            pred_probs[1][i]) > 0.0 else "None"
        unrestricted_best_match_index = pred_probs[0][i] if max(
            pred_probs[1][i]) > 0.0 else None
        print "Prediction from unrestricted choice set: %s (index %s, probability %s)" % (
            unrestricted_best_match, unrestricted_best_match_index,
            max(pred_probs[1][i]))
        print "True label: %s" % true_labels[i]
        # pred_probs[1][i],)

        predicted_labels.append(label_set[pred_probs[0][i]])
        restricted_test.append(restricted_best_match == true_labels[i])
        unrestricted_test.append(unrestricted_best_match == true_labels[i])
        reference.append(True)

    print "\nChoice set restricted:\n", ConfusionMatrix(
        reference, restricted_test)
    print "\nChoice set unrestricted:\n", ConfusionMatrix(
        reference, unrestricted_test)
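# Example invocation (script and database names are hypothetical; the flags
# mirror the argparse definitions above):
#
#   python dnn_classifier.py --features features.db --granularity 2 \
#       --kfactor 5 --offset 0 --steps 2000 --combined --weighted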