Example #1
def main():
    # load arguments
    args = parse_args()

    # load config
    with open(args.config) as f:
        config = yaml.load(f, Loader=yaml.FullLoader)

    # define logging level and format
    level = logging.INFO
    if args.debug:
        level = logging.DEBUG

    logging.basicConfig(format="%(asctime)s %(levelname)s:%(message)s",
                        datefmt="%Y-%m-%d %H:%M:%S",
                        level=level)

    # load splits
    test_doc_ids = utils.read_split(config["split"])
    logging.info(f"Number of test documents: {len(test_doc_ids)}")

    # load dataset
    dataset = utils.read_jsonl(config["dataset"], dict_key="id")

    # create word embeddings for scene labels
    if os.path.basename(config["scene_labels"]) == "places365_en.txt":
        language = "en"
    else:  # places365_de.txt
        language = "de"

    logging.info('Generate word embedding for scene labels ...')
    scene_labels = read_scene_labels(config["scene_labels"])
    scene_word_embeddings = get_scene_word_embeddings(scene_labels,
                                                      fasttext_bin_folder=args.fasttext,
                                                      language=language)

    # generate results for each document
    testset_similarities = {}
    with multiprocessing.Pool(args.threads) as p:
        pool_args = [(doc, test_doc_ids, scene_word_embeddings, config) for doc in dataset.values()]

        cnt_docs = 0
        for document_result in p.imap(calculate_results, pool_args):
            if document_result is None:
                continue

            cnt_docs += 1
            if cnt_docs % 100 == 0:
                logging.info(f"{cnt_docs} / {len(test_doc_ids)} documents processed ...")

            for key, val in document_result.items():
                if key not in testset_similarities:
                    testset_similarities[key] = []

                testset_similarities[key].append(val)

    results = metrics.calculate_metrics(testset_similarities)
    metrics.print_results(results)

    return 0
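The helper utils.read_split is not shown in this example (or in Example #4, which uses it the same way); judging by its use above, it returns the document ids of the test split. A minimal sketch of such a helper, purely an assumption (the original may, for instance, parse JSON instead of plain lines):

def read_split(split_file):
    """Return the set of document ids listed in a split file, one id per line."""
    with open(split_file) as f:
        return {line.strip() for line in f if line.strip()}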
Example #2
def process_reviews(product_name):
    print("Processing Reviews...")
    ratings_documents = read_split('sample_reviews.json', product_name)
    documents_rating_1 = ratings_documents[1]
    documents_rating_5 = ratings_documents[5]

    bow_corpus_1, dictionary_1 = process_single_product_reviews(
        documents_rating_1)
    bow_corpus_5, dictionary_5 = process_single_product_reviews(
        documents_rating_5)
    return bow_corpus_1, dictionary_1, bow_corpus_5, dictionary_5, documents_rating_1, documents_rating_5
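In this example read_split appears to group one product's reviews by star rating, so that ratings_documents[1] and ratings_documents[5] hold the 1-star and 5-star reviews (Example #3 uses a variant that also returns the raw data). A rough pandas sketch of such a helper; the column names "asin" and "overall" are assumptions, not taken from the original project:

import pandas as pd

def read_split(path, product_name):
    """Return a dict mapping star rating -> DataFrame of that product's reviews."""
    reviews = pd.read_json(path, lines=True)            # assumes a JSON-lines review dump
    reviews = reviews[reviews["asin"] == product_name]  # "asin" is a hypothetical product-id column
    by_rating = reviews.groupby(reviews["overall"].astype(int))  # "overall" assumed to hold the rating
    return {rating: group for rating, group in by_rating}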
Example #3
def bert_model(product_name):
    log.info("Processing Reviews...")
    
    ratings_documents, raw_data = read_split('sample_reviews.json', product_name)
    documents_rating_1 = ratings_documents[1]
    documents_rating_5 = ratings_documents[5]
    corpus_1 = documents_rating_1['reviewText'].to_list()
    corpus_5 = documents_rating_5['reviewText'].to_list()
    # print(corpus_5)
    corpus_embeddings_1 = bert.encode(corpus_1)
    corpus_embeddings_5 = bert.encode(corpus_5)
    
    return corpus_embeddings_1, corpus_embeddings_5, corpus_1, corpus_5
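The bert object is used here without being defined; it is presumably a sentence-embedding model created at module level, since .encode() is called on plain lists of review texts. One common setup, offered only as an assumption about the original project, uses sentence-transformers:

from sentence_transformers import SentenceTransformer

# hypothetical module-level model; any checkpoint exposing .encode() would fit this snippet
bert = SentenceTransformer("bert-base-nli-mean-tokens")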
Example #4
def main():
    # load arguments
    args = parse_args()

    # load config
    with open(args.config) as f:
        config = yaml.load(f, Loader=yaml.FullLoader)

    # define logging level and format
    level = logging.INFO
    if args.debug:
        level = logging.DEBUG

    logging.basicConfig(format="%(asctime)s %(levelname)s:%(message)s",
                        datefmt="%Y-%m-%d %H:%M:%S",
                        level=level)

    # load splits
    test_doc_ids = utils.read_split(config["split"])
    logging.info(f"Number of test documents: {len(test_doc_ids)}")

    # load dataset
    dataset = utils.read_jsonl(config["dataset"], dict_key="id")

    # generate results for each document
    testset_similarities = {}
    with multiprocessing.Pool(args.threads) as p:
        pool_args = [(doc, test_doc_ids, config) for doc in dataset.values()]

        cnt_docs = 0
        for document_result in p.imap(calculate_results, pool_args):
            if document_result is None:
                continue

            cnt_docs += 1
            if cnt_docs % 100 == 0:
                logging.info(
                    f"{cnt_docs} / {len(test_doc_ids)} documents processed ..."
                )

            for key, val in document_result.items():
                if key not in testset_similarities:
                    testset_similarities[key] = []

                testset_similarities[key].append(val)

    results = metrics.calculate_metrics(testset_similarities)
    metrics.print_results(results)

    return 0
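A small design note on the accumulation loop above: collections.defaultdict removes the explicit "key not in dict" check. An equivalent standalone variant (the result keys and values shown are purely illustrative, not from the original project):

from collections import defaultdict

testset_similarities = defaultdict(list)
# illustrative per-document result, as calculate_results might return it
document_result = {"similarity_text": 0.71, "similarity_image": 0.64}
for key, val in document_result.items():
    testset_similarities[key].append(val)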
Example #5
        edge_types.append((type_num_dict[j], type_num_dict[i]))

# Load data
G = load_graph_data(graph_path)

adjs_orig = get_edge_adj_matrices(G, {et: None for et in edge_types_strings})

# get adjacency matrices for subgraphs
adj_orig = nx.to_scipy_sparse_matrix(G)
adj_orig = adj_orig - sp.dia_matrix(
    (adj_orig.diagonal()[np.newaxis, :], [0]), shape=adj_orig.shape)
adj_orig.eliminate_zeros()

p = dataset_path + "random_splits/" + edge_type + "/random" + str(
    random_seed) + "/"
G_train, test_positive_e, test_negative_e, val_positive_e, val_negative_e, train_edges = read_split(
    G, edge_type.split("_"), random_seed, p)

t0 = time.time()
adjs_train = get_edge_adj_matrices(
    G_train, {et: adjs_orig[et]["nodes"]
              for et in adjs_orig})

adj_train = nx.to_scipy_sparse_matrix(G_train)
# adj = adj_train

k = tuple([type_num_dict[t] for t in edge_type.split("_")])
print("k", k, edge_type)

nodes0 = adjs_orig[edge_type]["nodes"][0]
nodes1 = adjs_orig[edge_type]["nodes"][1]
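A compatibility note on the calls above: nx.to_scipy_sparse_matrix was deprecated and has been removed in NetworkX 3.0 in favour of nx.to_scipy_sparse_array. If this snippet has to run on a recent NetworkX, a small shim such as the following keeps both call sites working (a sketch, not part of the original code):

import networkx as nx

def to_sparse_adjacency(G):
    """Return the sparse adjacency of G across old and new NetworkX versions."""
    if hasattr(nx, "to_scipy_sparse_array"):
        return nx.to_scipy_sparse_array(G)   # newer NetworkX
    return nx.to_scipy_sparse_matrix(G)      # older NetworkX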
Example #6
    length = int(sys.argv[6])  # 100

    f = open(graph_path)
    G = nx.Graph()

    for line in f:
        a, b = line.strip().split("\t")
        G.add_edge(a, b)

    G.remove_edges_from(G.selfloop_edges())
    GC = max(nx.connected_component_subgraphs(G),
             key=len)  # take greatest connected component

    p = path + "random_splits/" + edge_type[0] + "_" + edge_type[
        1] + "/random" + str(random_seed) + "/"
    G_train, test_positive, test_negative, val_positive, val_negative, train_edges = read_split(
        GC, edge_type, random_seed, p)

    t0 = time.time()

    print("Meta path classifier")
    if dataset == "bio":
        mpg = MetaPathGeneratorBio(random_seed)
    elif dataset == "sicris":
        mpg = MetaPathGeneratorSicris(random_seed)
    elif dataset == "imdb":
        mpg = MetaPathGeneratorImdb(random_seed)
    elif dataset == "amazon":
        mpg = MetaPathGeneratorAmazon(random_seed)
    elif dataset == "yelp":
        mpg = MetaPathGeneratorYelp(random_seed)
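The elif chain that selects a meta-path generator per dataset can equivalently be written as a dictionary dispatch; this variant reuses the class names from the snippet above, which must already be importable in the original project:

# equivalent to the elif chain above
GENERATORS = {
    "bio": MetaPathGeneratorBio,
    "sicris": MetaPathGeneratorSicris,
    "imdb": MetaPathGeneratorImdb,
    "amazon": MetaPathGeneratorAmazon,
    "yelp": MetaPathGeneratorYelp,
}
mpg = GENERATORS[dataset](random_seed)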
Example #7
    f = open(graph_path)
    G = nx.Graph()

    for line in f:
        a, b = line.strip().split("\t")
        G.add_edge(a, b)

    G.remove_edges_from(G.selfloop_edges())
    print(G.number_of_nodes())
    GC = max(nx.connected_component_subgraphs(G),
             key=len)  # take greatest connected component
    print(GC.number_of_nodes())
    p = path + "random_splits/" + edge_type[0] + "_" + edge_type[
        1] + "/random" + str(num) + "/"
    print(p)
    G_train, test_positive, test_negative, val_positive, val_negative, train_edges = read_split(
        GC, edge_type, num, p)

    p = path + "features/" + edge_type[0] + "_" + edge_type[
        1] + "/random" + str(num) + "/"
    t0 = time.time()
    simple_model = SimpleClassifier(G_train, train_edges, test_positive,
                                    test_negative, val_positive, val_negative,
                                    p)
    t1 = time.time()
    print("Preparation:", t1 - t0)
    simple_model.train(method, num)
    print("Training:", time.time() - t1)

    simple_model.predict()
    print("Acc:", simple_model.evaluate())
    simple_model.predict(prob=True)
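Examples #6 and #7 call G.selfloop_edges() and nx.connected_component_subgraphs(), both of which were removed in NetworkX 2.4. On current NetworkX the same preprocessing can be written as below; this is a self-contained sketch with a toy edge list standing in for the real graph file:

import networkx as nx

G = nx.Graph()
G.add_edges_from([("a", "b"), ("b", "b"), ("c", "d")])             # toy graph with one self-loop
G.remove_edges_from(list(nx.selfloop_edges(G)))                     # replaces G.selfloop_edges()
GC = G.subgraph(max(nx.connected_components(G), key=len)).copy()    # replaces connected_component_subgraphs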