def main(args):
    """Rebuild the output directory with train/val/test cluster files.

    Reads the clusters from ``args.dataset``, attaches the articles found in
    ``args.wcep_articles`` (matched by URL) and ``args.cc_articles`` (matched
    by ID), splits the result into ``train.jsonl``, ``val.jsonl`` and
    ``test.jsonl`` inside ``args.o`` and finally runs ``cleanup_clusters`` on
    each of these files.

    NOTE: an already existing directory at ``args.o`` is deleted first.
    """
    outdir = pathlib.Path(args.o)
    # always start from an empty output directory
    if outdir.exists():
        shutil.rmtree(outdir)
    # parents=True: a nested output path whose parent does not exist yet
    # would otherwise fail with FileNotFoundError
    outdir.mkdir(parents=True)
    # the directory was created just now, so no stale tmp file can be present
    tmp_clusters_path = outdir / 'tmp_clusters.jsonl'

    # get article -> cluster mappings
    clusters = list(utils.read_jsonl(args.dataset))
    url_to_cluster_idxs, id_to_cluster_idx = get_article_to_cluster_mappings(
        clusters)

    # add articles from WCEP to clusters, using URLs
    add_wcep_articles_to_clusters(args.wcep_articles, url_to_cluster_idxs,
                                  clusters)

    # add articles from CommonCrawl to clusters, using IDs
    add_cc_articles_to_clusters(clusters, args.cc_articles, id_to_cluster_idx,
                                tmp_clusters_path)

    # split clusters into separate train/val/test files
    split_dataset(outdir, tmp_clusters_path)
    tmp_clusters_path.unlink()

    # the (now removed) tmp path is handed to cleanup_clusters again for each
    # split file
    for fn in ['train.jsonl', 'val.jsonl', 'test.jsonl']:
        cleanup_clusters(outdir / fn, tmp_clusters_path)
Example #2
0
def main():
    """Tokenize the reviews of a jsonl file and save the result as jsonl.

    Command-line arguments (``sys.argv``):
        1) tokenizer name: ``twitter``, ``komoran`` or ``mecab``
        2) twitter ``norm`` option (optional)
        3) twitter ``stemming`` option (optional)

    The two twitter options are only read when both are given; otherwise both
    default to True. The input and output file names are asked for
    interactively and are resolved relative to ``./data/``.
    """
    # Map names to classes so that only the requested tokenizer is built.
    tokenizer_classes = {
        'twitter': Twitter,
        'komoran': Komoran,
        'mecab': Mecab,
    }

    def parse_flag(text):
        # bool('False') is True, so the text must be interpreted explicitly.
        # Everything that is not an explicit "off" value stays True.
        return text.strip().lower() not in ('', '0', 'false', 'no', 'n', 'off')

    # Show the usage text for a missing or unknown tokenizer name instead of
    # failing with a KeyError after the input file has already been read.
    if len(sys.argv) < 2 or sys.argv[1] not in tokenizer_classes:
        print('please insert sys_argv \n',
              '1) tokenizer selection: twitter, komoran, mecab \n',
              '2) twitter_tokenizer_option: norm [no argv means True] \n',
              '3) twitter_tokenizer_option: stemming [no argv means True]')
        return

    tokenizer = tokenizer_classes[sys.argv[1]]()

    output_file_path = './data/' + input('output_text_file_name: ')
    input_file_path = './data/' + input('input_text_file_name: ')

    if len(sys.argv) == 4:
        twi_norm = parse_flag(sys.argv[2])
        twi_stem = parse_flag(sys.argv[3])
    else:
        twi_norm = twi_stem = True

    app_id_list, app_name_list, cate_list, rating_list, review_list = read_jsonl(
        input_file_path, key_ma=False)

    ma_list = [
        get_pos(tokenizer=tokenizer,
                doc=review,
                twi_norm=twi_norm,
                twi_stem=twi_stem)
        for review in tqdm(review_list,
                           desc='tokenizing',
                           total=len(review_list))
    ]

    save_jsonl(output_file_path, app_id_list, app_name_list, cate_list,
               rating_list, ma_list)
def main():
    """Generate word embeddings for a dataset and store them in an HDF5 file.

    The dataset is read from ``args.dataset`` (keyed by ``id``); for every
    document the embeddings of its ``text`` are written to ``args.output``
    under the document id. Documents without a ``text`` entry get an empty
    dataset, and ids that are already present in the output file are skipped,
    so an interrupted run can be resumed.

    Returns:
        0 after all documents have been processed.
    """
    # load arguments
    args = parse_args()

    # define logging level and format
    level = logging.DEBUG if args.debug else logging.INFO
    logging.basicConfig(format='%(asctime)s %(levelname)s:%(message)s',
                        datefmt='%Y-%m-%d %H:%M:%S',
                        level=level)

    # create model depending on model type
    logging.info('Init WordEmbedding ...')
    if "tamperednews" in args.dataset:
        language = "en"
    else:  # news400
        language = "de"
    we = WordEmbedder(fasttext_bin_folder=args.fasttext,
                      language=language,
                      token_types=args.tokens)

    # create output dir; dirname is '' for a bare file name, in which case
    # there is nothing to create (os.makedirs('') would raise)
    output_dir = os.path.dirname(args.output)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # read dataset
    dataset = utils.read_jsonl(args.dataset, dict_key='id')

    logging.info('Generate word embeddings ...')

    # save embeddings to h5; mode 'a' opens an existing file read/write and
    # creates it otherwise, so no separate existence check is needed
    with h5py.File(args.output, 'a') as output_file:
        for cnt_docs, document in enumerate(dataset.values(), start=1):
            if cnt_docs % 100 == 0:
                logging.info(f'{cnt_docs} documents processed')

            doc_id = document['id']
            if doc_id in output_file:
                logging.debug(f"{doc_id} already processed")
                continue

            if "text" not in document:
                # keep an (empty) entry so the document counts as processed
                output_file[doc_id] = []
                continue

            output_file[doc_id] = we.generate_embeddings(document["text"])

    return 0
Example #4
0
def read_article_ids(path, max_cluster_size):
    """Collect the ids of the ``cc_articles`` listed in a cluster file.

    Args:
        path: jsonl file with one cluster per line.
        max_cluster_size: only the first ``max_cluster_size`` articles of
            each cluster are considered; ``-1`` disables the limit.

    Returns:
        A tuple ``(ids, id_to_collection)``: the set of article ids and a
        dict mapping each article id to the ``collection`` value of the
        cluster it was (last) seen in.
    """
    collection_by_id = {}
    for cluster in utils.read_jsonl(path):
        selected = cluster['cc_articles']
        if max_cluster_size != -1:
            selected = selected[:max_cluster_size]
        for article in selected:
            collection_by_id[article['id']] = cluster['collection']
    # the id set is exactly the key set of the mapping
    return set(collection_by_id), collection_by_id
Example #5
0
def main():
    """Download the images referenced by a jsonl dataset.

    For ``args.type == "news"`` every document contributes one image that is
    stored as ``<args.output>/<id>.jpg``; otherwise every entry of a
    document's ``image_urls`` is stored as
    ``<args.output>/<wd_id>/<filename>``. The downloads themselves are done by
    ``download_news_images`` on a pool of ``args.threads`` workers.

    Returns:
        0 once the pool has processed all download jobs.
    """
    args = parse_args()

    # logging setup: DEBUG when requested, INFO otherwise
    logging.basicConfig(format="%(asctime)s %(levelname)s:%(message)s",
                        datefmt="%Y-%m-%d %H:%M:%S",
                        level=logging.DEBUG if args.debug else logging.INFO)

    # read the dataset; news documents are keyed by "id", all others by "wd_id"
    is_news = args.type == "news"
    dataset = utils.read_jsonl(args.input,
                               dict_key="id" if is_news else "wd_id")
    if is_news and not os.path.exists(args.output):
        os.makedirs(args.output)

    # one job per image: [target file, source url, maximum size]
    jobs = []
    for doc in dataset.values():
        if is_news:
            target = os.path.join(args.output, doc["id"] + ".jpg")
            jobs.append([target, doc["image_url"], args.maxsize])
            continue

        for image in doc["image_urls"]:
            target = os.path.join(args.output, doc["wd_id"],
                                  image["filename"])

            # make sure the per-document folder exists before downloading
            target_dir = os.path.dirname(target)
            if not os.path.exists(target_dir):
                os.makedirs(target_dir)

            jobs.append([target, image["url"], args.maxsize])

    # download images
    with multiprocessing.Pool(args.threads) as pool:
        pool.map(download_news_images, jobs)

    return 0
Example #6
0
def Kmeans_clustering(load_file_path, vectorizer, n_cluster=None):
    """Fit a ``MiniBatchKMeans`` model on the vectorized documents of a file.

    Args:
        load_file_path: file handed to ``read_jsonl``.
        vectorizer: vectorizer handed to ``get_matrix``.
        n_cluster: number of clusters; when falsy, the number of distinct
            values in the category list is used.

    Returns:
        ``(model, words, app_id_list, app_name_list, cate_list, rating_list,
        ma_list)``: the fitted model, the second value returned by
        ``get_matrix`` and the five lists read from the file.
    """
    records = read_jsonl(load_file_path)
    app_id_list, app_name_list, cate_list, rating_list, ma_list = records

    # build the document matrix from the space-joined token lists
    matrix_parts = get_matrix(vectorizer, ma_transform_by_spacing(ma_list))
    coo_matrix, words, _ = matrix_parts

    # default: one cluster per distinct category
    cluster_count = n_cluster if n_cluster else len(set(cate_list))

    model = MiniBatchKMeans(n_clusters=cluster_count,
                            verbose=0,
                            n_init=10,
                            batch_size=100)
    model.fit(coo_matrix.toarray())

    return (model, words, app_id_list, app_name_list, cate_list, rating_list,
            ma_list)
def add_cc_articles_to_clusters(clusters, cc_path, id_to_cluster_idx,
                                tmp_clusters_path):
    """Attach CommonCrawl articles to their clusters and flush full clusters.

    Every article read from ``cc_path`` is appended to the
    ``cc_articles_filled`` list of the cluster its id maps to. As soon as a
    cluster holds as many filled articles as it lists in ``cc_articles``, it
    is appended to ``tmp_clusters_path`` and its slot in ``clusters`` is set
    to ``None``.

    Args:
        clusters: list of cluster dicts; modified in place.
        cc_path: jsonl file with the CommonCrawl articles.
        id_to_cluster_idx: maps an article id to an index into ``clusters``.
        tmp_clusters_path: jsonl file that completed clusters are appended to.

    Raises:
        KeyError: if an article id is not contained in ``id_to_cluster_idx``.
    """
    print('adding articles from CommonCrawl to clusters')
    n_clusters = len(clusters)
    n_clusters_done = 0
    # Explicit counter instead of the loop index: it stays defined for an
    # empty input and holds the true total after the loop.
    n_articles_done = 0
    for a in utils.read_jsonl(cc_path):
        if n_articles_done % 10000 == 0:
            print(
                f'{n_articles_done} cc articles done, {n_clusters_done}/{n_clusters} clusters done'
            )
        cluster_idx = id_to_cluster_idx[a['id']]
        c = clusters[cluster_idx]
        c.setdefault('cc_articles_filled', []).append(a)
        if len(c['cc_articles']) == len(c['cc_articles_filled']):
            # cluster is complete: write it out and drop it from the list
            utils.write_jsonl([c], tmp_clusters_path, mode='a')
            clusters[cluster_idx] = None
            n_clusters_done += 1
        n_articles_done += 1
    print(
        f'{n_articles_done} cc articles done, {n_clusters_done}/{n_clusters} clusters done'
    )
Example #8
0
    def __init__(self, image_dir, testset_path):
        """Load the test set and set up the image preprocessing pipeline.

        Args:
            image_dir: stored unchanged as ``self.image_dir``.
            testset_path: jsonl file handed to ``utils.read_jsonl``.
        """
        self.image_dir = image_dir

        # Read dataset; every requested key is mapped to itself
        wanted_keys = (
            "image_path",
            "image_hash",
            "leaf_class_idx",
            "leaf_wd_id",
        )
        self.dataset = utils.read_jsonl(
            testset_path,
            keep_keys={key: key for key in wanted_keys},
        )

        # Build image transformation: PIL conversion, resize and center crop
        # to 224, tensor conversion, then normalization
        transforms = torchvision.transforms
        steps = [
            transforms.ToPILImage(),
            transforms.Resize(size=224),
            transforms.CenterCrop(size=224),
            transforms.ToTensor(),
            transforms.Normalize([0.485, 0.456, 0.406],
                                 [0.229, 0.224, 0.225]),
        ]
        self.transform = transforms.Compose(steps)
Example #9
0
"""
Quick example of how to read the outputs and make a markdown table
"""
import pandas as pd
import json
import numpy as np
import flatten_dict

from utils import read_jsonl, early_stopping_metrics

# Materialize the records: they are iterated twice below, which would leave
# the second pass empty if read_jsonl returned a one-shot iterator.
lines = list(read_jsonl('outputs/grid_search_results.jsonl'))

# Only records that carry a 'metrics_runs' entry are used; the metrics are
# taken from the first run of each record.
metrics = [
    early_stopping_metrics(rec['metrics_runs'][0]) for rec in lines
    if 'metrics_runs' in rec
]
args = [pd.DataFrame([rec['args']]) for rec in lines if 'metrics_runs' in rec]
# axis must be passed by keyword: pandas >= 2.0 accepts only `objs`
# positionally in pd.concat
runs = [pd.concat([a.T, m], axis=0).T for a, m in zip(args, metrics)]
df_runs = pd.concat(runs)
print('columns', df_runs.columns)

# choose only some of the cols
metrics = [
    'test_hard_metric.Accuracy',
    'test_hard_metric.Exact match',
    #    'test_hard_metrics.F1-Score',
    #    'test_hard_metrics.ROC AUC',
    'test_metric.Accuracy',
    'test_metric.Exact match',
    #    'test_metrics.F1-Score',
    #    'test_metrics.ROC AUC'
def eval(data_file, data_type, model_name):
    """Run ``topk_eval`` with ``k=1`` on the records read from *data_file*.

    Args:
        data_file: jsonl file handed to ``read_jsonl``.
        data_type: passed through to ``topk_eval``.
        model_name: passed through to ``topk_eval``.

    Returns:
        Whatever ``topk_eval`` returns.
    """
    return topk_eval(model_name, read_jsonl(data_file), data_type, k=1)
Example #11
0
        paths['test']  = 'data/test_CNNDM_' + encoder + '.jsonl'
    return paths

# Resolve the data file locations for the "test" mode with the "bert" encoder.
path = get_data_path("test", "bert")
print(path)

# Load the data through the MatchSum pipeline and pick the test split.
datasets = MatchSumPipe(20, "bert").process_from_file(path)
print('Information of dataset is:')
print(datasets)
test_set = datasets.datasets['test']
device = 0
batch_size = 1

# Evaluate every stored model on the test split.
for cur_model in models:
    print('Current model is {}'.format(cur_model))

    # load model
    model = torch.load(join(save_path, cur_model))

    # configure testing: the metric gets the raw test records plus the paths
    # returned by get_result_path
    dec_path, ref_path = get_result_path(save_path, cur_model)
    test_metric = MatchRougeMetric(
        data=read_jsonl(path['test']),
        dec_path=dec_path,
        ref_path=ref_path,
        n_total=len(test_set),
    )
    tester = Tester(
        data=test_set,
        model=model,
        metrics=[test_metric],
        batch_size=batch_size,
        device=device,
        use_tqdm=False,
    )
    tester.test()
def main():
    """Compute image embeddings for a dataset and store them in an HDF5 file.

    Depending on ``args.model`` a facial feature, geolocation or scene
    classification network is created. For every dataset entry the matching
    image file(s) below ``args.directory`` are looked up and their embedding
    (or, with ``args.logits``, the logits) is written to ``args.output``.
    Keys that already exist in the output file are skipped, so an interrupted
    run can be resumed.

    Returns:
        0 in every case, including the early exits for an invalid output
        name, an unknown model or an invalid ``--logits`` combination.
    """
    # load arguments
    args = parse_args()

    # define logging level and format (absl and stdlib logging are kept in
    # sync)
    if args.debug:
        level = logging.DEBUG
        absl_level = 'debug'
    else:
        level = logging.INFO
        absl_level = 'info'
    absl.logging.set_stderrthreshold(absl_level)
    absl.logging.set_verbosity(absl_level)

    logging.basicConfig(format='%(asctime)s %(levelname)s:%(message)s',
                        datefmt='%Y-%m-%d %H:%M:%S',
                        level=level)

    # check output file and create folder
    if not args.output.endswith('.h5'):
        logging.error('Output file should end with .h5. Extiting ...')
        return 0

    # dirname is '' for a bare file name, in which case there is nothing to
    # create (os.makedirs('') would raise)
    output_dir = os.path.dirname(args.output)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # create model depending on model type
    FE = None
    if 'facenet' in args.model:
        logging.info(
            'Building network for embedding based on person verification ...')
        FE = FacialFeatureExtractor(model_path=args.model)
    elif 'location' in args.model:
        logging.info(
            'Building network for embedding based on geolocation estimation ...'
        )
        FE = GeoEstimator(model_path=args.model, use_cpu=args.use_cpu)
    elif 'scene' in args.model:
        logging.info(
            'Building network for embedding based on scene classification ...')
        FE = SceneClassificator(model_path=args.model)

    # explicit None check: do not depend on the truthiness of the extractor
    if FE is None:
        logging.error('Unknown model. Exiting ...')
        return 0

    if 'scene' not in args.model and args.logits:
        logging.error(
            'Please specify a scene classification model to create scene logits.'
        )
        return 0

    # read dataset
    is_news = args.type == 'news'
    dataset = utils.read_jsonl(args.input,
                               dict_key='id' if is_news else 'wd_id')

    logging.info(f'{len(dataset.keys())} dataset entries to process')

    # create embeddings; mode 'a' opens an existing file read/write and
    # creates it otherwise, so no separate existence check is needed
    with h5py.File(args.output, 'a') as output_file:
        for entry in dataset.values():
            # collect the image path stems (without extension) of this entry
            if is_news:
                images = [{
                    'fname': os.path.join(args.directory, entry['id']),
                    'search_engine': None
                }]
            else:  # entity
                images = []
                for image in entry['image_urls']:
                    stem, _ = os.path.splitext(image['filename'])
                    images.append({
                        'fname':
                        os.path.join(args.directory, entry['wd_id'], stem),
                        'search_engine':
                        image['search_engine']
                    })

            for image in images:
                # get image_file with extension
                image_files = glob.glob(image['fname'] + '.*')
                if not image_files:
                    logging.info(
                        f"Cannot find image file {image['fname']}.jpg")
                    continue
                image_file = image_files[0]

                if is_news:
                    h5_key = entry['id']
                else:  # entity
                    h5_key = f"{entry['wd_id']}/{image['search_engine']}/{os.path.basename(image['fname'])}"

                if h5_key in output_file:
                    logging.info(
                        f'Embedding for {h5_key} already computed ...')
                    continue

                logging.info(f'Generate embedding for {h5_key} ...')

                if args.logits:
                    img_emb = FE.get_logits(image_file)
                else:
                    img_emb = FE.get_img_embedding(image_file)
                img_embeddings = list(img_emb)

                if not img_embeddings:
                    # keep an (empty) entry so the image counts as processed
                    logging.debug(f'No embedding found for {h5_key} ...')
                    output_file[h5_key] = []
                else:
                    # convert to np array and store to output file
                    output_file[h5_key] = np.asarray(img_embeddings,
                                                     dtype=np.float32)

    return 0
Example #13
0
def main():
    """Run ``run_job`` for every job listed in ``./emoji2.jsonl`` on a pool."""
    job_list = list(read_jsonl("./emoji2.jsonl"))
    print(f"{len(job_list)} jobs to run.")
    with Pool() as pool:
        pending = pool.imap_unordered(run_job, job_list, chunksize=10)
        # The results are not used; the iterator is only drained so that the
        # pool is not left before all jobs have been handed back.
        for _ in pending:
            pass