import glob
import os

import cossim
import tfidf


def find_similar_patents(patent, citations, k):
    citation_doc_name = []
    citation_document = []
    patent_found = False
    # Path to patent text
    path_name = "D:/ASU_Courses/SWM/project/patent_text/"
    for file in glob.glob(os.path.normpath(path_name + "*.txt")):
        file_name = os.path.basename(file).split('.')[0]
        if file_name == patent:
            patent_found = True
        if file_name in citations:
            citation_doc_name.append(file_name)
            text = open(file, encoding='utf-8').read()
            citation_document.append(text)
    if not patent_found:
        print("Patent Text not available")
        return [], []
    else:
        # Put the query patent first so its TF-IDF row is at index 0.
        patent_text = open(path_name + patent + ".txt", encoding='utf-8').read()
        citation_document.insert(0, patent_text)
        citation_doc_name.insert(0, patent)
        tfidf_vec = tfidf.compute_tfidf(citation_document)
        #print(tfidf_vec.shape)
        similarity_indices, similarity_values = cossim.compute(
            tfidf_vec[0], tfidf_vec, k)
        patent_names_similar = [
            citation_doc_name[index] for index in similarity_indices
        ]
        return patent_names_similar, similarity_values
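# Hypothetical usage sketch (the patent IDs and citation list below are made up;
# the local `tfidf` and `cossim` helper modules are assumed from the code above):
#
#   names, scores = find_similar_patents("US1234567",
#                                         citations=["US2345678", "US3456789"],
#                                         k=5)
#   for name, score in zip(names, scores):
#       print(name, score)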
def make_data(X_pure_train, X_sentences_train, aspects_list_train,
              X_pure_test, X_sentences_test):
    sent_word = sentiRuLex()
    w2v = W2V()
    #w2v = None
    list_of_tfidf_train = compute_tfidf(X_sentences_train)
    list_of_tfidf_test = compute_tfidf(X_sentences_test)
    y_train = make_y_train(X_pure_train, aspects_list_train)
    # Domain keywords: 'ресторан' = "restaurant", 'автомобиль' = "car".
    x_train = make_x(X_pure_train, list_of_tfidf_train, sent_word, w2v,
                     'ресторан')
    x_test = make_x(X_pure_test, list_of_tfidf_test, sent_word, w2v,
                    'автомобиль')
    return x_train, y_train, x_test
import os
from pprint import pprint

import embedding
import tfidf


def main():
    # initialize
    os.chdir(os.path.dirname(os.path.realpath(__file__)))
    news_path, embeddings_path, amount, query_terms = parse_args()

    # Sample and preprocess documents, then weight their terms with TF-IDF.
    rand_docs = get_random_docs(news_path, amount)
    preprocessed = [preprocess(doc) for doc in rand_docs]
    idf = tfidf.compute_idf(preprocessed)
    tfidfs_ = [tfidf.compute_tfidf(idf, doc) for doc in preprocessed]

    # Turn each document's TF-IDF weights into an embedding vector.
    emb = embedding.load_emebeddings(embeddings_path)
    embedding_vecs = [
        embedding.compute_embedding_vec(emb, tfidf_) for tfidf_ in tfidfs_
    ]

    # Embed the query the same way and rank the documents against it.
    query_tfidf = tfidf.compute_tfidf(idf, query_terms)
    query_embedded = embedding.compute_embedding_vec(emb, query_tfidf)
    ranked = tfidf.rank(query_embedded, embedding_vecs)
    pprint(ranked[:10])
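# A minimal sketch of what the `compute_idf` / `compute_tfidf` helpers called
# above might look like; the dict-based return types are assumptions inferred
# from the call sites, not the project's actual `tfidf` module.
import math
from collections import Counter


def compute_idf(docs):
    """docs: list of token lists -> {term: idf}."""
    n = len(docs)
    df = Counter(term for doc in docs for term in set(doc))
    return {term: math.log(n / count) for term, count in df.items()}


def compute_tfidf(idf, doc):
    """doc: list of tokens -> {term: tf * idf}."""
    counts = Counter(doc)
    total = len(doc) or 1
    return {term: (count / total) * idf.get(term, 0.0)
            for term, count in counts.items()}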
import numpy as np

import tfidf


def lexrank(sentences, N, threshold, vectorizer, model):
    """
    Summarize a document with LexRank.
    @param sentences: list
        Sentences, e.g. [u'こんにちは.', u'私の名前は佐藤です.', ...]
    @param N: int
        Number of sentences in the document
    @param threshold: float
        Similarity threshold used when building the adjacency matrix (similarity graph)
    @param vectorizer: string
        Sentence vectorization method ("tf-idf" / "word2vec")
    @param model:
        word2vec model (used when vectorizer == "word2vec")
    @return: L
        LexRank scores
    """
    CosineMatrix = np.zeros([N, N])
    degree = np.zeros(N)
    L = np.zeros(N)

    if vectorizer == "tf-idf":
        vector = tfidf.compute_tfidf(sentences)
    elif vectorizer == "word2vec":
        vector = tfidf.compute_word2vec(sentences, model)

    # Computing Adjacency Matrix
    for i in range(N):
        for j in range(N):
            CosineMatrix[i, j] = tfidf.compute_cosine(vector[i], vector[j])
            if CosineMatrix[i, j] > threshold:
                CosineMatrix[i, j] = 1
                degree[i] += 1
            else:
                CosineMatrix[i, j] = 0

    # Computing LexRank Score
    for i in range(N):
        for j in range(N):
            CosineMatrix[i, j] = CosineMatrix[i, j] * 1.0 / (
                degree[i] if degree[i] != 0 else 1.0)

    L = PowerMethod(CosineMatrix, N, err_tol=10e-6)

    return L
def calc_lexrank(sentences, N, threshold, vectorizer):
    """
    Summarize a document with LexRank.
    @param sentences: list
        Sentences, given as a list of tokenized sentences like
        [[w1, w2, w3], [w1, w3, w4, w5], ...]
    @param N: int
        Number of sentences in the document
    @param threshold: float
        Cosine similarity threshold (default 0.1)
    @param vectorizer: string
        Sentence vectorization method ("tf-idf" / "word2vec")
    @return: list
        LexRank scores
    """
    CosineMatrix = np.zeros([N, N])
    degree = np.zeros(N)
    L = np.zeros(N)

    if vectorizer == "tf-idf":
        vector = tfidf.compute_tfidf(sentences)
    elif vectorizer == "word2vec":
        vector = tfidf.compute_word2vec(sentences)

    # 1. Build the adjacency matrix
    for i in range(N):
        for j in range(N):
            CosineMatrix[i, j] = tfidf.compute_cosine(vector[i], vector[j])
            if CosineMatrix[i, j] > threshold:
                CosineMatrix[i, j] = 1
                degree[i] += 1
            else:
                CosineMatrix[i, j] = 0

    # 2. Compute LexRank
    for i in range(N):
        for j in range(N):
            CosineMatrix[i, j] = CosineMatrix[i, j] / degree[i]

    L = PowerMethod(CosineMatrix, N, err_tol=10e-6)

    return L
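# Both LexRank variants above call a PowerMethod helper that is not shown here.
# A minimal power-iteration sketch with the same call signature (the uniform
# starting vector and convergence test are assumptions about the original):
import numpy as np


def PowerMethod(CosineMatrix, N, err_tol):
    """Iterate p <- M^T p until the change falls below err_tol."""
    p = np.ones(N) / N          # uniform initial score vector
    err = 1.0
    while err > err_tol:
        p_next = np.dot(CosineMatrix.T, p)
        err = np.linalg.norm(p_next - p)
        p = p_next
    return p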
import argparse
import sqlite3

import tensorflow as tf

import tfidf

# Helpers such as granularize, featurize, train_input_fn, test_input_fn, and the
# param_type / ParamType / discrete_possible_values / ConfusionMatrix names used
# below come from elsewhere in this project.


def main():
    parser = argparse.ArgumentParser(description='DNN')
    parser.add_argument('--features', '-f', help='features database')
    parser.add_argument(
        '--granularity', '-g',
        help='granularity level (1 = coarse, label with predicate only; 2 = middle, label with predicate and adjunct if available; 3 = fine, label with full parse)')
    parser.add_argument(
        '--kfactor', '-k',
        help='Exclude every kth entry (training), include every kth entry (testing)')
    parser.add_argument(
        '--offset', '-n',
        help='Exclude every kth entry (training), include every kth entry (testing), offset by n')
    parser.add_argument('--steps', '-s', help='# training steps')
    parser.add_argument('--combined', '-c', action='store_true',
                        help='Use combined classifier')
    parser.add_argument('--weighted', '-w', action='store_true',
                        help='Weight features')
    parser.add_argument('--discrete_weights_only', '-d', action='store_true',
                        help='Only weight discrete features')
    parser.add_argument(
        '--omit_features', '-o', action='store_true',
        help='Omit feature information (use weights only as features)')
    args = parser.parse_args()

    global features_db
    features_db = args.features
    global granularity
    granularity = int(args.granularity)
    global kfactor
    kfactor = int(args.kfactor)
    global offset
    offset = int(args.offset)
    global steps
    steps = int(args.steps)
    global combined
    combined = args.combined
    global weighted
    weighted = args.weighted
    global discrete_weights_only
    discrete_weights_only = args.discrete_weights_only
    global omit_features
    omit_features = args.omit_features

    slice = -1

    # Load video entries and their alternate-sentence candidates from SQLite.
    features_conn = sqlite3.connect(features_db)
    features_cur = features_conn.cursor()
    features_cur.execute('SELECT * FROM VideoDBEntry')
    results = features_cur.fetchall()[:slice]
    features_cur.execute('SELECT * FROM AlternateSentences')
    candidates = [list(r[2:]) for r in features_cur.fetchall()[:slice]]
    features_conn.close()

    for i in range(len(candidates)):
        for j in range(len(candidates[i])):
            candidates[i][j] = granularize(candidates[i][j], granularity)

    print candidates

    # Every kth entry (with offset) becomes dev/test data; the rest is training data.
    global dev_test
    dev_test = results[offset::kfactor]
    test_candidates = candidates[offset::kfactor]
    global train
    train = [r for r in results if r not in dev_test]
    train_canditates = [
        candidates[i] for i in range(len(candidates))
        if candidates[i] not in dev_test
    ]

    global label_set
    label_set = []
    true_labels = []

    dev_test = featurize(dev_test)
    train = featurize(train)

    for entry in dev_test:
        if granularize(entry["Input"], granularity) not in label_set:
            label_set.append(granularize(entry["Input"], granularity))
        true_labels.append(granularize(entry["Input"], granularity))

    for entry in train:
        if granularize(entry["Input"], granularity) not in label_set:
            label_set.append(granularize(entry["Input"], granularity))

    for candidate_set in candidates:
        for candidate in candidate_set:
            if candidate not in label_set:
                label_set.append(candidate)

    print label_set

    # Build the feature columns; discrete features can be IDF-weighted.
    wide_columns = []
    deep_columns = []
    feature_columns = []

    global tfidf_bias
    tfidf_bias = {}
    for key in sorted(param_type):
        tfidf_bias[key] = tfidf.compute_tfidf(features_db, key)
        if param_type[key] == ParamType.discrete:
            if weighted:
                sparse_feature = tf.contrib.layers.sparse_column_with_keys(
                    key, discrete_possible_values[key],
                    dtype=tf.int64 if type(discrete_possible_values[key][0]) is int
                    else tf.string)
                weighted_feature = tf.contrib.layers.weighted_sparse_column(
                    sparse_id_column=sparse_feature,
                    weight_column_name=key + "IDFWeight")
                feature_columns.append(
                    tf.contrib.layers.one_hot_column(sparse_feature))
                deep_columns.append(
                    tf.contrib.layers.one_hot_column(sparse_feature))
                feature_columns.append(
                    tf.contrib.layers.embedding_column(weighted_feature, dimension=1))
                deep_columns.append(
                    tf.contrib.layers.embedding_column(weighted_feature, dimension=1))
            else:
                feature_columns.append(
                    tf.contrib.layers.one_hot_column(
                        tf.contrib.layers.sparse_column_with_keys(
                            key, discrete_possible_values[key],
                            dtype=tf.int64 if type(discrete_possible_values[key][0]) is int
                            else tf.string)))
                deep_columns.append(
                    tf.contrib.layers.one_hot_column(
                        tf.contrib.layers.sparse_column_with_keys(
                            key, discrete_possible_values[key],
                            dtype=tf.int64 if type(discrete_possible_values[key][0]) is int
                            else tf.string)))
            # feature_columns.append(tf.contrib.layers.sparse_column_with_keys(key+"IDFWeight", [tfidf_bias[key]], dtype=tf.float32))
            # deep_columns.append(tf.contrib.layers.sparse_column_with_keys(key+"IDFWeight", [tfidf_bias[key]], dtype=tf.float32))
            # feature_columns.append(tf.contrib.layers.real_valued_column(key+"IDFWeight", dimension=1))
            # deep_columns.append(tf.contrib.layers.real_valued_column(key+"IDFWeight", dimension=1))
        elif param_type[key] == ParamType.continuous:
            feature_columns.append(
                tf.contrib.layers.real_valued_column(key, dimension=1))
            wide_columns.append(
                tf.contrib.layers.real_valued_column(key, dimension=1))
            # feature_columns.append(tf.contrib.layers.real_valued_column(key+"IDFWeight", dimension=1))
            # wide_columns.append(tf.contrib.layers.real_valued_column(key+"IDFWeight", dimension=1))

    # print train

    if combined:
        classifier = tf.contrib.learn.DNNLinearCombinedClassifier(
            linear_feature_columns=wide_columns,
            dnn_feature_columns=deep_columns,
            dnn_hidden_units=[10, 20, 20, 10],
            n_classes=len(label_set))
    else:
        classifier = tf.contrib.learn.DNNClassifier(
            feature_columns=feature_columns,
            hidden_units=[10, 20, 20, 10],
            n_classes=len(label_set))

    print tfidf_bias

    classifier.fit(input_fn=train_input_fn, steps=steps)
    # print classifier.evaluate(input_fn=test_input_fn, steps=1)

    testing_data = test_input_fn

    predicted_labels = []

    probs = classifier.predict_proba(input_fn=testing_data)
    predictions = classifier.predict(input_fn=testing_data)
    pred_probs = [[i for i in predictions], [j for j in probs]]

    reference = []
    restricted_test = []
    unrestricted_test = []

    for i in range(len(pred_probs[0])):
        # Restrict the prediction to the candidate sentences for this entry.
        candidate_indices = [
            label_set.index(test_candidates[i][j])
            for j in range(len(test_candidates[i]))
        ]
        restricted_best_prob = pred_probs[1][i][candidate_indices[0]]
        restricted_best_match = label_set[candidate_indices[0]] if max(
            pred_probs[1][i]) > 0.0 else "None"
        restricted_best_match_index = candidate_indices[0] if max(
            pred_probs[1][i]) > 0.0 else None
        for k in candidate_indices:
            if pred_probs[1][i][k] > restricted_best_prob:
                restricted_best_prob = pred_probs[1][i][k]
                restricted_best_match = label_set[k]
                restricted_best_match_index = k

        multiple_choice = []
        multiple_choice.append(
            candidate_indices.index(restricted_best_match_index) + 1)
        for k in candidate_indices:
            if pred_probs[1][i][k] == restricted_best_prob:
                if candidate_indices.index(k) + 1 not in multiple_choice:
                    multiple_choice.append(candidate_indices.index(k) + 1)

        print "\nCandidates in restricted choice set: %s (indices %s)" % (
            test_candidates[i], candidate_indices)
        multiple_choice = list(set(sorted(multiple_choice)))
        print "Prediction with multiple choice option: %s" % (
            sorted(multiple_choice))
        print "Prediction with restricted choice set: %s (index %s, probability %s)" % (
            restricted_best_match, restricted_best_match_index,
            restricted_best_prob)

        unrestricted_best_match = label_set[pred_probs[0][i]] if max(
            pred_probs[1][i]) > 0.0 else "None"
        unrestricted_best_match_index = pred_probs[0][i] if max(
            pred_probs[1][i]) > 0.0 else None
        print "Prediction from unrestricted choice set: %s (index %s, probability %s)" % (
            unrestricted_best_match, unrestricted_best_match_index,
            max(pred_probs[1][i]))
        print "True label: %s" % true_labels[i]
        # pred_probs[1][i],)

        predicted_labels.append(label_set[pred_probs[0][i]])
        restricted_test.append(restricted_best_match == true_labels[i])
        unrestricted_test.append(unrestricted_best_match == true_labels[i])
        reference.append(True)

    print "\nChoice set restricted:\n", ConfusionMatrix(
        reference, restricted_test)
    print "\nChoice set unrestricted:\n", ConfusionMatrix(
        reference, unrestricted_test)
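# Example invocation (script and database names are hypothetical; the flags
# mirror the argparse definitions above):
#
#   python dnn_classifier.py --features features.db --granularity 2 \
#       --kfactor 5 --offset 0 --steps 2000 --combined --weighted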