def main():
    parser = argparse.ArgumentParser()
    arg = parser.add_argument
    arg('q_model', help='Path to Q.joblib (deep-deep link model)')
    arg('data', help='Path to jl.gz file in CDR format')
    arg('output_folder', help='Where to put html output files N.html')
    arg('--offset', type=int, default=0, help='0-based start index in data')
    arg('--limit',
        type=int,
        default=100,
        help='Number of documents to analyze')
    args = parser.parse_args()

    output_path = Path(args.output_folder)
    output_path.mkdir(exist_ok=True)
    q_model = joblib.load(args.q_model)
    assert not q_model.get('page_vectorizer'), 'TODO'
    le = DictLinkExtractor()
    styles = format_html_styles()

    with json_lines.open(args.data, broken=True) as items:
        items = islice(items, args.offset, None)
        if args.limit:
            items = islice(items, args.limit)
        with multiprocessing.Pool() as pool:
            for idx, expls in enumerate(
                    pool.imap(partial(links_expls, q_model, le), items)):
                expls.sort(reverse=True)
                (output_path.joinpath(
                    '{}.html'.format(idx + args.offset)).write_text(
                        styles + '\n'.join(expl for _, expl in expls)))
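The snippet above omits its imports; the ones below cover the standard-library and third-party pieces it uses, while DictLinkExtractor, format_html_styles and links_expls are project-specific helpers assumed to come from the surrounding codebase (the commented import path is hypothetical).

import argparse
import multiprocessing
from functools import partial
from itertools import islice
from pathlib import Path

import joblib
import json_lines

# Project-specific helpers, not part of any library (hypothetical module name):
# from deepdeep_qa_helpers import DictLinkExtractor, format_html_styles, links_expls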
def convert_item_set_to_dict(file_dir):
    """Converts a dataset file into dict of item-information"""

    item_info = {
        "item_id": [],
        "title": [],
        "price": [],
        "category_id": [],
        "product_id": [],
        "domain_id": [],
        "condition": []
    }

    with json_lines.open(file_dir) as f:
        for item in f:
            item_info['item_id'].append(item['item_id'])
            item_info['title'].append(item['title'])
            item_info['price'].append(item['price'])
            item_info['category_id'].append(item['category_id'])
            item_info['product_id'].append(item['product_id'])
            item_info['domain_id'].append(item['domain_id'])
            item_info['condition'].append(item['condition'])

    print("Finished reading file, proceeding to DataFrame")
    return item_info
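The closing print suggests the dict is meant to feed a DataFrame next; a minimal usage sketch, assuming pandas and an illustrative file path:

import pandas as pd

item_info = convert_item_set_to_dict('item_data.jl.gz')  # path is illustrative
items_df = pd.DataFrame(item_info)  # one column per key, one row per item
print(items_df.head())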
Example #3
def test_reader_broken_json_partial(tmpdir):
    # with broken=True broken json lines are skipped, but reading continues
    p = tmpdir.join('myfile.jl')
    p.write_binary(b'{"a": 1}\n{"a": 2\n{"b": 1}\n')
    with json_lines.open(str(p), broken=True) as f:
        lines = list(f)
    assert lines == [{'a': 1}, {'b': 1}]
Example #4
def main():
    parser = argparse.ArgumentParser()
    arg = parser.add_argument
    arg('input', help='.jl or .jl.gz file in CDRv2 format')
    arg('output', help='path to .jl or .jl.gz output in CDRv3 format')
    arg('--broken',
        action='store_true',
        help='specify if input might be broken (incomplete)')
    args = parser.parse_args()
    assert args.input != args.output

    with json_lines.open(args.input, broken=args.broken) as f:
        opener = gzip.open if args.output.endswith('.gz') else open
        with opener(args.output, 'wt') as outf:
            for v2_item in f:
                dt = datetime.fromtimestamp(v2_item['timestamp'] / 1e3)
                timestamp_crawl = format_timestamp(dt)
                assert v2_item['version'] == 2.0
                v3_item = CDRItem(
                    _id=format_id(v2_item['url'], timestamp_crawl),
                    crawler=v2_item['crawler'],
                    team=v2_item['team'],
                    timestamp_crawl=timestamp_crawl,
                    version=3.0,
                    url=v2_item['url'],
                    raw_content=v2_item['raw_content'],
                    content_type=v2_item['content_type'],
                    response_headers={'content-type': v2_item['content_type']},
                )
                outf.write(json.dumps(dict(v3_item)))
                outf.write('\n')
Example #5
    def run(self, args, opts):
        if not args:
            raise UsageError()
        if len(args) == 1 and '*' in args[0]:
            # paths were not expanded (docker)
            filenames = glob.glob(args[0])
        else:
            filenames = args
        del args
        filtered_filenames = [
            f for f in filenames
            if re.match(r'[a-z0-9]{12}\.csv$', os.path.basename(f))
        ]
        filenames = filtered_filenames or filenames
        if not filenames:
            raise UsageError()

        response_logs = []
        for filename in filenames:
            with json_lines.open(filename) as f:
                response_logs.append(pd.DataFrame(f))
        print('Read data from {} files'.format(len(filenames)))

        all_rpms = [
            rpms for rpms in (
                get_rpms(name, rlog, step=opts.step, smooth=opts.smooth)
                for name, rlog in zip(filenames, response_logs))
            if rpms is not None
        ]
        if all_rpms:
            print_rpms(all_rpms, opts)

        print_scores(response_logs, opts)
Example #6
def main(args=None):
    parser = argparse.ArgumentParser()
    parser.add_argument('infile', help='Pages in .jl.gz')
    parser.add_argument(
        'out_prefix',
        help='Output prefix (two files are written: one with '
        'full data and one with meta: status, url, domain, lang)')
    args = parser.parse_args(args)

    with json_lines.open(args.infile, broken=True) as f, \
            gzip.open(args.out_prefix + '.items.jl.gz', 'wt') as items_file, \
            gzip.open(args.out_prefix + '.meta.jl.gz', 'wt') as meta_file:
        n_errors = 0
        with multiprocessing.Pool() as pool:
            for text_item in pool.imap_unordered(convert_item,
                                                 f,
                                                 chunksize=100):
                if text_item is None:
                    n_errors += 1
                else:
                    items_file.write(json.dumps(text_item))
                    items_file.write('\n')
                    meta_file.write(
                        json.dumps({
                            key: text_item[key]
                            for key in
                            ['url', 'domain', 'lang', 'status', 'mangled_url']
                        }))
                    meta_file.write('\n')

    print('Number of errors: {}'.format(n_errors))
def convert_set_to_dict(file_dir):
    """Converts a dataset file into user-item-tmps dict"""
    user_item_tmstmp = {
        "user_id": [],
        "item_id": [],
        "rating": [],
        "timestamp": []
    }

    with json_lines.open(file_dir) as f:
        for index, item in enumerate(f):

            # Save User Events
            for event in item['user_history']:
                if check_item_id(event['event_info']):
                    user_item_tmstmp['user_id'].append(index)
                    user_item_tmstmp['item_id'].append(event['event_info'])
                    user_item_tmstmp['rating'].append(1)
                    user_item_tmstmp['timestamp'].append(
                        datetime.datetime.strptime(event['event_timestamp'],
                                                   '%Y-%m-%dT%H:%M:%S.%f%z'))

            # Save User Item Tmpstmp Bought
            user_item_tmstmp['user_id'].append(index)
            user_item_tmstmp['item_id'].append(item['item_bought'])
            user_item_tmstmp['rating'].append(5)
            user_item_tmstmp['timestamp'].append(
                user_item_tmstmp['timestamp'][-1] +
                datetime.timedelta(hours=2))

    print("Finished reading file, proceding Dataframe")
    return user_item_tmstmp
def read_samples(in_file, out_dir, args):
    ''' 'LR' version stores [Left headline/description, Right headline/description, left/right index]'''
    left_labeled_data_LR = []
    right_labeled_data_LR = []
    ''' 'LCR' version stores 
    [Left headline/description, Center headline/description, Right headline/description, left/right index]'''
    left_labeled_data_LCR = []
    right_labeled_data_LCR = []
    ''' 'Left_Center_Desc_Right' version stores
    [Left headline/description, Center description, Right headline/description, left/right index]'''
    left_labeled_data_Left_Center_Desc_Right = []
    right_labeled_data_Left_Center_Desc_Right = []

    LCR_data_nonpairs = []
    with json_lines.open(in_file) as f:
        for item in f:
            LCR_data_nonpairs = read_samples_unpaired(args.data_type,
                                                      item["articles"],
                                                      LCR_data_nonpairs)

            # left_labeled_data_LR, right_labeled_data_LR = read_samples_util_LR(args.data_type, item['articles'], left_labeled_data_LR, right_labeled_data_LR)

            # left_labeled_data_LCR, right_labeled_data_LCR = read_samples_util_LCR(args.data_type, item['articles'], left_labeled_data_LCR, right_labeled_data_LCR)

            # left_labeled_data_Left_Center_Desc_Right, right_labeled_data_Left_Center_Desc_Right = read_samples_util_Left_Center_Desc_Right("article_headline", item['articles'], left_labeled_data_Left_Center_Desc_Right, right_labeled_data_Left_Center_Desc_Right)

        Path(out_dir).mkdir(parents=True, exist_ok=True)

        pickle.dump(
            LCR_data_nonpairs,
            open(
                os.path.join(out_dir, args.data_type + "_LCR_nonpairs.pickle"),
                'wb'))
Example #9
def test_reader_broken_json_partial_gzipped(tmpdir):
    # For gzip files broken=True only means gzip recovery, inside a single
    # archive processing stops at first broken json line
    p = tmpdir.join('myfile.jl.gz')
    write_gz(p, b'{"a": 1}\n{"a": 2\n{"b": 1}\n')
    with json_lines.open(str(p), broken=True) as f:
        lines = list(f)
    assert lines == [{'a': 1}]
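The test above relies on a write_gz helper that is not shown; a minimal sketch of what it presumably does, namely gzip-compressing the raw bytes to the given path:

import gzip

def write_gz(path, data):
    # Sketch of the assumed helper: write raw bytes into a gzip archive.
    with gzip.open(str(path), 'wb') as f:
        f.write(data)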
Example #10
def read_input(input_file):
    """This method reads the input file which is in gzip format"""

    logging.info("reading file {0}...this may take a while".format(input_file))
    max_items = 20000
    with json_lines.open(input_file, 'rb') as f:
        for item in f:
            # The decrement below was commented out, so the cap is never hit
            # and every item is yielded.
            if max_items > 0:
                # max_items = max_items - 1
                yield item['s']
Example #11
def read_blind_file():
    x = []
    with json_lines.open('test_dataset_blind.jsonl') as reader:
        for line in reader:
            p = list()
            for instruction in line['instructions']:
                p.append(instruction)
            x.append(" ".join(p))

    return x
Example #12
def init_factions(factions):
    print "Searching factions..."
    with json_lines.open(settings.factions_jsonl) as reader:
        for item in reader:
            for faction in factions:
                if isinstance(faction, int):
                    if item['id'] == faction:
                        save_faction(item)
                else:
                    if item['name'] == faction:
                        save_faction(item)
Example #13
def read_file_opt():
    x = []
    opt = list()
    with json_lines.open('train_dataset.jsonl') as reader:
        for line in reader:
            p = list()
            for instruction in line['instructions']:
                p.append(instruction)
            x.append(" ".join(p))
            opt.append(line['opt'])

    return x, opt
Example #14
def read_file_comp():
    x = []
    comp = list()
    with json_lines.open('train_dataset.jsonl') as reader:
        for line in reader:
            p = list()
            for instruction in line['instructions']:
                p.append(instruction)
            x.append(" ".join(p))
            comp.append(line['compiler'])

    return x, comp
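Joining each function's instruction list into a space-separated string is the usual preparation for a bag-of-words model; a possible downstream step (scikit-learn is an assumption, not part of the original):

from sklearn.feature_extraction.text import TfidfVectorizer

texts, compilers = read_file_comp()
vectorizer = TfidfVectorizer()
# One row per function, one column per instruction token.
X = vectorizer.fit_transform(texts)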
def read_samples_description_sep_headline(in_file, out_dir, args):
    Path(out_dir).mkdir(parents=True, exist_ok=True)
    headline_desc_pair_data = []

    # desc_sep_headline_output_file = open(os.path.join(out_dir, "description_sep_headline.txt"), "w")
    newsdesc_sep_headline_output_file_prefix = "description_sep_headline"
    newsdesc_sep_ideologylabel_headline_output_file_prefix = "description_sep_ideologylabel_headline"
    allsides_desc_sep_LCR_headline_output_file_prefix = "allsides_desc_sep_LCR_headline"
    allsides_desc_news_desc_ideology_headline_output_file_prefix = "allsides_desc_news_desc_ideology_headline"

    newsdesc_sep_headline_data = []
    newsdesc_ideology_headline_data = []
    allsides_desc_sep_ideology_headline_data = []
    allsides_desc_news_desc_ideology_headline_data = []

    with json_lines.open(in_file) as f:
        for item in f:
            for article in item["articles"]:
                newsdesc_sep_headline_data.append(
                    "<BOS> " +
                    article["article_description"].lower().replace("\n", " ") +
                    " <SEP> " +
                    article["article_headline"].replace("\n", " ") + " <EOS>")

                ideology_label = "<" + article["political_spectrum"].upper(
                ) + ">"
                newsdesc_ideology_headline_data.append(
                    "<BOS> " +
                    article["article_description"].lower().replace("\n", " ") +
                    " <SEP> " + ideology_label +
                    article["article_headline"].replace("\n", " ") + " <EOS>")

            allsides_desc_sep_ideology_headline_data = get_allsides_desc_ideology_headline(
                allsides_desc_sep_ideology_headline_data, item)
            allsides_desc_news_desc_ideology_headline_data = get_allsides_desc_news_desc_ideology_headline(
                allsides_desc_news_desc_ideology_headline_data, item)

        write_to_text_file_util(newsdesc_sep_headline_data, \
            out_dir, \
            newsdesc_sep_headline_output_file_prefix)

        write_to_text_file_util(newsdesc_ideology_headline_data, \
            out_dir, \
            newsdesc_sep_ideologylabel_headline_output_file_prefix)

        write_to_text_file_util(allsides_desc_sep_ideology_headline_data, \
            out_dir, \
            allsides_desc_sep_LCR_headline_output_file_prefix)

        write_to_text_file_util(allsides_desc_news_desc_ideology_headline_data, \
            out_dir, \
            allsides_desc_news_desc_ideology_headline_output_file_prefix)
Example #16
def read_geo_peers():
    peer_ip_countries = dict()
    try:
        with json_lines.open(peer_address_geo_file) as f:
            #thing = json.load(f)
            for item in f:
                #print(item["collector"])
                if item["collector"] in ignore_multi_hop_collectors:
                    continue  #ignore multi-hop collectors for now
                col = item["collector"]
                #print (col)
                for monitor in item["peers"]:
                    if col != "caida":
                        ip_address = str(monitor["peer_address"])
                        if ip_address in peer_ip_countries:
                            continue  #ignore peers already read from a different collector
                        #checksum to determine if monitor is both full feed and confidence == 1
                        try:
                            ffeed = int(monitor["full_feed"])
                        except ValueError:
                            #print ("monitorFails full-feed " + str(monitor["peer_address"]))
                            continue  #no full feed value, ignore

                        try:
                            conf = int(monitor["confidence"])
                        except ValueError:
                            #print ("monitorFails confidence " + str(monitor["peer_address"]))
                            continue  #no confidence value, ignore

                        csum = conf + ffeed
                        if csum < 2:
                            #print ("monitorFails confidence or full feed " + str(monitor["peer_address"]))
                            continue  #ignore monitors that aren't both full feed and confidence 1
                    else:
                        ip_address = str(monitor["dns_name"])

                    try:
                        country = str(monitor["final_country"])
                    except (KeyError, ValueError):
                        #print ("monitorFails final country " + str(monitor["peer_address"]))
                        continue  #no final country value, ignore
                    #print ("monitorPass " + str(monitor["peer_address"]))
                    peer_ip_countries[ip_address] = country
    except Exception:
        sys.stderr.write("\n something went wrong opening " +
                         peer_address_geo_file + "\n")
        sys.exit(1)
    return peer_ip_countries
def main():
    parser = argparse.ArgumentParser()
    arg = parser.add_argument
    arg('input', help='In .jl.gz format with html in "text" field')
    arg('html_field', help='Field name where html is stored')
    arg('output', help='Output in jl.gz format with text in "text" field')
    args = parser.parse_args()

    with json_lines.open(args.input, broken=True) as f, gzip.open(
            args.output, 'wt') as outf:
        with multiprocessing.Pool() as pool:
            for text_item in pool.imap_unordered(
                    partial(text_worker, html_field=args.html_field), f):
                if text_item is not None:
                    outf.write(json.dumps(text_item))
                    outf.write('\n')
Example #18
def extract_classifications(file_path, set_type):
    i = 0
    with json_lines.open(file_path) as reader:
        for obj in reader:
            classification = obj["truthMean"]
            # classification = 1 if obj["truthClass"] == "clickbait" else 0
            if set_type == "training":
                training_classifications.append(classification)
            else:
                test_classifications.append(classification)
                test_ids.append(obj["id"])

            # Only collect features for the number of samples specified (takes too long for all 17,000)
            i += 1
            if set_type == "training" and i == max_samples:
                break
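extract_classifications appends to module-level lists and reads max_samples, none of which are shown; a sketch of the assumed module-level state:

# Assumed module-level state used by extract_classifications (values illustrative).
max_samples = 2000               # cap on training samples to keep feature extraction fast
training_classifications = []    # truthMean values for the training set
test_classifications = []        # truthMean values for the test set
test_ids = []                    # ids of test instances, kept for writing predictions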
Example #19
def load_system_stats_file(*, je, path, metrics_data, node, q):
    """
    Extract relevant data from a Xcalar system stats file.
    """

    with json_lines.open(path) as f:
        for dikt in f:
            for metric_id in metrics_data.ids_for_source(source="_SYSTEM_STATS"):
                mcfg = metrics_data.cfg_for_id(metric_id=metric_id).dikt
                if 'xy_expr' in mcfg:
                    points = je.extract_xy(xy_expr=mcfg.get('xy_expr'), dikt=dikt)
                elif 'key_expr' in mcfg and 'val_expr' in mcfg:
                    points = je.extract_kv(key_expr=mcfg.get('key_expr'),
                                           val_expr=mcfg.get('val_expr'), dikt=dikt)
                else:
                    raise ValueError("invalid metric config: {}".format(mcfg))
                put_points(node=node, metric_id=metric_id, points=points, q=q)
Example #20
def init_systems():
    print "Searching systems..."
    faction_ids = []
    sql = 'SELECT id FROM minor_factions WHERE name IN %s' % (in_clause(
        settings.monitored_factions))
    c = db.cursor()
    c.execute(sql)
    for row in c:
        faction_ids.append(row[0])
    print(faction_ids)
    with json_lines.open(settings.systems_jsonl) as reader:
        for item in reader:
            print(item['name'], "                \r", end='')
            for faction in item['minor_faction_presences']:
                if faction['minor_faction_id'] in faction_ids:
                    save_system(item)
                    print ""
def read_geo_peers():
    peer_ip_countries = dict()
    with json_lines.open(peer_address_geo_file) as f:
        #thing = json.load(f)
        for item in f:
            #print(item["collector"])
            if item["collector"] in ignore_multi_hop_collectors:
                continue  #ignore multi-hop collectors for now

            for monitor in item["peers"]:
                ip_address = str(monitor["peer_address"])
                if ip_address in peer_ip_countries:
                    continue  #ignore peers already read from a different collector
                #checksum to determine if monitor is both full feed and confidence == 1
                try:
                    ffeed = int(monitor["full_feed"])
                except ValueError:
                    print("monitorFails full-feed " +
                          str(monitor["peer_address"]))
                    continue  #no full feed value, ignore

                try:
                    conf = int(monitor["confidence"])
                except ValueError:
                    print("monitorFails confidence " +
                          str(monitor["peer_address"]))
                    continue  #no confidence value, ignore

                csum = conf + ffeed
                if csum < 2:
                    print("monitorFails confidence or full feed " +
                          str(monitor["peer_address"]))
                    continue  #ignore monitors that aren't both full feed and confidence 1

                try:
                    country = str(monitor["final_country"])
                except (KeyError, ValueError):
                    print("monitorFails final country " +
                          str(monitor["peer_address"]))
                    continue  #no final country value, ignore
                print("monitorPass " + str(monitor["peer_address"]))
                peer_ip_countries[ip_address] = country
    return peer_ip_countries
Example #22
def load_data(train_jsonl_file, data_type):  # data_type=[train, dev, test]
    doc_num = 0
    qa_num = 0

    documents = []
    question_answer = {}
    dataset = {"documents": documents, 'question_answer': question_answer}
    with json_lines.open(train_jsonl_file) as f:
        for item in f:

            if 'header' in item:
                dataset['name'] = item['header']['dataset']

            if 'qas' in item:

                document = Document()
                document.context = item['context']
                document.context_token = item['context_tokens']
                qas = item['qas']

                for qa in qas:
                    qa_ = QA()

                    qa_.question = qa['question']
                    # qa_.id = qa['id'] some datasets don't have id
                    qa_.qid = qa['qid']
                    qa_.question_tokens = qa['question_tokens']
                    if data_type != 'test':
                        qa_.answers = qa['answers']
                        qa_.detected_answers = qa['detected_answers']
                        question_answer[qa['qid']] = qa['answers']
                    document.qas.append(qa_)
                    qa_num += 1

                documents.append(document)
                doc_num += 1

    logging.info("{}: {} documents, {} questions".format(
        train_jsonl_file, doc_num, qa_num))
    return dataset
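Document and QA are simple containers defined elsewhere; judging from the attribute accesses above, they could look roughly like this (a sketch, not the original definitions):

class QA:
    # Minimal container implied by load_data's usage (assumption).
    def __init__(self):
        self.question = None
        self.qid = None
        self.question_tokens = None
        self.answers = None
        self.detected_answers = None


class Document:
    # Minimal container implied by load_data's usage (assumption).
    def __init__(self):
        self.context = None
        self.context_token = None
        self.qas = []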
Example #23
def main():
    parser = argparse.ArgumentParser()
    arg = parser.add_argument
    arg('dmoz_urls_topics', help='In .csv.gz format')
    arg('dmoz_text', help='Items with url and text fields in .jl.gz format')
    arg('output', help='output file for fasttext training')
    args = parser.parse_args()

    with gzip.open(args.dmoz_urls_topics, 'rt') as f:
        topics_by_url = dict(csv.reader(f))

    with json_lines.open(args.dmoz_text) as f, open(args.output, 'wt') as outf:
        for item in f:
            topics = topics_by_url[item['url']]
            topics = topics.split('/')
            if topics[0] == 'Top':
                topics = topics[1:]
            topics = [t for t in topics if not (len(t) == 1 and t.isupper())]
            for topic in topics:
                outf.write('__label__{} '.format(topic))
            outf.write(to_single_line(item['text']))
            outf.write('\n')
def read_samples(in_file, international_keywords, domestic_keywords):

    domestic_articles = []
    international_articles = []

    with json_lines.open(in_file) as f:
        count = 0
        for item in f:
            international_flag = False
            count += 1
            if item['allsides_description']:
                international_flag = international_flag or check_keywords(
                    item['allsides_description'], international_keywords)

            for article in item['articles']:
                if article['article_description']:
                    international_flag = international_flag or check_keywords(
                        article['article_description'], international_keywords)

            if check_keywords(item['allsides_description'], domestic_keywords):
                international_flag = False

            for article in item['articles']:
                if article['article_description']:
                    if check_keywords(article['article_description'],
                                      domestic_keywords):
                        international_flag = False

            if not international_flag:
                item["type"] = "domestic"
                domestic_articles.append(item)
            else:
                item["type"] = "international"
                international_articles.append(item)
        # print(count)
        return domestic_articles, international_articles
Example #25
def plot(*args,
         ymin=None,
         ymax=None,
         xmin=None,
         xmax=None,
         params=False,
         max_points=200):
    """ Use in the notebook like this:
    plot('./runs/oc2', './runs/oc1', 'loss', 'valid_loss')
    """
    paths, keys = [], []
    for x in args:
        if x.startswith('.') or x.startswith('/'):
            if '*' in x:
                paths.extend(glob.glob(x))
            else:
                paths.append(x)
        else:
            keys.append(x)
    plt.figure(figsize=(12, 8))
    keys = keys or ['loss', 'valid_loss']

    ylim_kw = {}
    if ymin is not None:
        ylim_kw['ymin'] = ymin
    if ymax is not None:
        ylim_kw['ymax'] = ymax
    if ylim_kw:
        plt.ylim(**ylim_kw)

    xlim_kw = {}
    if xmin is not None:
        xlim_kw['xmin'] = xmin
    if xmax is not None:
        xlim_kw['xmax'] = xmax
    if xlim_kw:
        plt.xlim(**xlim_kw)
    for path in paths:
        path = Path(path)
        with json_lines.open(str(path.joinpath('train.log')),
                             broken=True) as f:
            events = list(f)
        if params:
            print(path)
            pprint(json.loads(path.joinpath('params.json').read_text()))
        for key in sorted(keys):
            xs, ys = [], []
            for e in events:
                if key in e:
                    xs.append(e['step'])
                    ys.append(e[key])
            if xs:
                if len(xs) > 2 * max_points:
                    indices = (np.arange(0, len(xs),
                                         len(xs) / max_points).astype(
                                             np.int32))
                    xs = np.array(xs)[indices[1:]]
                    ys = [
                        np.mean(ys[idx:indices[i + 1]])
                        for i, idx in enumerate(indices[:-1])
                    ]
                plt.plot(xs, ys, label='{}: {}'.format(path, key))
    plt.legend()
Example #26
def plot(*args,
         ymin=None,
         ymax=None,
         xmin=None,
         xmax=None,
         params=False,
         max_points=200,
         legend=True,
         title=None,
         print_keys=False,
         print_paths=False,
         plt=None,
         newfigure=True,
         x_scale=1):
    """
    Use in the notebook like this::

        %matplotlib inline
        from imet.utils import plot
        plot('./runs/oc2', './runs/oc1', 'loss', 'valid_loss')

    """
    import json_lines  # not available on Kaggle

    if plt is None:
        from matplotlib import pyplot as plt
    paths, keys = [], []
    for x in args:
        if x.startswith('.') or '/' in x:
            if '*' in x:
                paths.extend(glob.glob(x))
            else:
                paths.append(x)
        else:
            keys.append(x)
    if print_paths:
        print('Found paths: {}'.format(' '.join(sorted(paths))))
    if newfigure:
        plt.figure(figsize=(12, 8))
    keys = keys or ['loss', 'valid_loss']

    ylim_kw = {}
    if ymin is not None:
        ylim_kw['bottom'] = ymin
    if ymax is not None:
        ylim_kw['top'] = ymax
    if ylim_kw:
        plt.ylim(**ylim_kw)

    xlim_kw = {}
    if xmin is not None:
        xlim_kw['left'] = xmin
    if xmax is not None:
        xlim_kw['right'] = xmax
    if xlim_kw:
        plt.xlim(**xlim_kw)
    all_keys = set()
    for path in sorted(paths):
        path = Path(path)
        with json_lines.open(path / 'train.log', broken=True) as f:
            events = list(f)
        all_keys.update(k for e in events for k in e)
        for key in sorted(keys):
            xs, ys, ys_err = [], [], []
            for e in events:
                if key in e:
                    xs.append(e['step'] * x_scale)
                    ys.append(e[key])
                    std_key = key + '_std'
                    if std_key in e:
                        ys_err.append(e[std_key])
            if xs:
                if np.isnan(ys).any():
                    print('Warning: NaN {} for {}'.format(key, path))
                if len(xs) > 2 * max_points:
                    indices = (np.arange(0,
                                         len(xs) - 1,
                                         len(xs) / max_points).astype(
                                             np.int32))
                    xs = np.array(xs)[indices[1:]]
                    ys = _smooth(ys, indices)
                    if ys_err:
                        ys_err = _smooth(ys_err, indices)
                label = '{}: {}'.format(path, key)
                if label.startswith('_'):
                    label = ' ' + label
                if ys_err:
                    ys_err = 1.96 * np.array(ys_err)
                    plt.errorbar(xs,
                                 ys,
                                 yerr=ys_err,
                                 fmt='-o',
                                 capsize=5,
                                 capthick=2,
                                 label=label)
                else:
                    plt.plot(xs, ys, label=label)
                plt.legend()
    if newfigure:
        plt.grid()
    if legend:
        plt.legend()
    if title:
        plt.title(title)
    if print_keys:
        print('Found keys: {}'.format(', '.join(
            sorted(all_keys - {'step', 'dt'}))))
Example #27
def iter_html(path):
    with json_lines.open(path, broken=True) as lines:
        for line in lines:
            yield line['raw_content']
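A small usage sketch for iter_html, pulling page titles out with lxml (the parsing library and file name are assumptions):

import lxml.html

for raw_html in iter_html('pages.jl.gz'):  # file name is illustrative
    tree = lxml.html.fromstring(raw_html)
    print(tree.findtext('.//title'))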
Example #28
def read_samples(file):
    data_dir = os.path.dirname(file)

    # headlines = {
    #     'left': [],
    #     'center': [],
    #     'right': []
    # }
    # descriptions = {
    #     'left': [],
    #     'center': [],
    #     'right': []
    # }
    headlines = []
    descriptions = []
    publisher_spectrum_map = {}
    label_map = {'left': 0, 'center': 1, 'right': 2}
    counts = {'left': 0, 'center': 0, 'right': 0}
    split_ratio = 0.9
    publisher_info_flag = False

    with json_lines.open(file) as f:
        for item in f:
            headline = " ".join(
                [val.lower() for val in item["article_headline"].split()])
            desc = " ".join(
                [val.lower() for val in item["article_description"].split()])
            # print(item['publisher'])
            news_publisher = None
            if item['publisher'] is not None:
                news_publisher = " ".join(
                    [val.lower() for val in item["publisher"].split()])

            if item['publisher'] is not None:
                if item['publisher'] not in publisher_spectrum_map:
                    publisher_spectrum_map[item['publisher']] = []

                publisher_spectrum_map[item['publisher']].append(
                    item["political_spectrum"].lower())

            if item["political_spectrum"].lower(
            ) != "" and item['publisher'] != None:
                if publisher_info_flag:
                    headlines.append([
                        news_publisher + ". " + headline,
                        label_map[item["political_spectrum"].lower()]
                    ])
                    descriptions.append([
                        news_publisher + ". " + desc,
                        label_map[item["political_spectrum"].lower()]
                    ])
                else:
                    headlines.append([
                        headline, label_map[item["political_spectrum"].lower()]
                    ])
                    descriptions.append(
                        [desc, label_map[item["political_spectrum"].lower()]])
                # headlines[item["political_spectrum"].lower()].append(news_publisher+". "+headline)
                # descriptions[item["political_spectrum"].lower()].append(news_publisher+". "+desc)
                # counts[item["political_spectrum"].lower()] += 1
            # except:
            # print("Val: ", item["political_spectrum"].lower())

        # train_headlines = []
        # test_headlines = []

        # train_descriptions = []
        # test_descriptions = []
        # for label in ["left", "center", "right"]:
        #     train_size = int(len(headlines[label])*split_ratio)
        #     for val in headlines[label][:train_size]:
        #         train_headlines.append([val, label_map[label]])

        #     for val in headlines[label][train_size:]:
        #         test_headlines.append([val, label_map[label]])

        #     train_size = int(len(descriptions[label])*split_ratio)
        #     for val in descriptions[label][:train_size]:
        #         train_descriptions.append([val, label_map[label]])

        #     for val in descriptions[label][train_size:]:
        #         test_descriptions.append([val, label_map[label]])

        if publisher_info_flag:
            pickle.dump(
                headlines,
                open(
                    os.path.join(
                        data_dir,
                        'article_headlines_train_with_publisher.pickle'),
                    'wb'))
            # pickle.dump(test_headlines, open(os.path.join(data_dir, 'article_headlines_test_with_publisher.pickle'), 'wb'))
            pickle.dump(
                descriptions,
                open(
                    os.path.join(
                        data_dir,
                        'article_descriptions_train_with_publisher.pickle'),
                    'wb'))
            # pickle.dump(test_descriptions, open(os.path.join(data_dir, 'article_descriptions_test_with_publisher.pickle'), 'wb'))
        else:
            pickle.dump(
                headlines,
                open(os.path.join(data_dir, 'article_headlines_train.pickle'),
                     'wb'))
            pickle.dump(
                descriptions,
                open(
                    os.path.join(data_dir,
                                 'article_descriptions_train.pickle'), 'wb'))

        for key, val in publisher_spectrum_map.items():
            # print(key, np.unique(np.array(val)))
            print(key, val)
            if len(np.unique(np.array(val))) > 1:
                print(key, np.unique(np.array(val)))
Example #29
                        talf = alf.split("(")[0].replace('-', '').strip()
                        det['affli'] += talf + "\n"
            details.append(det)
    else:
        for a in al:
            det = {}
            det['auth'] = a.split("(")[0].replace('-', '').strip()
            det['affli'] = afflistr.split("(")[0].replace('-', '').strip()
            details.append(det)
    return details


filename = 'output'
records = []

with json_lines.open(Ingredients.getOutputFilesPath() +
                     'wcd2019ltabstracts.jl') as f:
    for item in f:
        records.append(item)

with json_lines.open(Ingredients.getOutputFilesPath() +
                     'wcd2019lterabstracts.jl') as f:
    for item in f:
        records.append(item)

links = []
with json_lines.open(Ingredients.getOutputFilesPath() + 'abslblinks.jl') as f:
    for item in f:
        links.append(item)

rows = []
for item in records:
Example #30
# -*- coding: utf-8 -*-
import elasticsearch
import json_lines

props = set()

with json_lines.open('index_dbpedia201510.json') as data_file:
    es = elasticsearch.Elasticsearch([{
        'host': 'localhost',
        'port': 9200
    }],
                                     timeout=60)

    if es.indices.exists(index="dbpedia201510"):
        es.indices.delete(index="dbpedia201510")

    body = '{"settings": {"index.auto_expand_replicas": "1-all","index.number_of_shards": 1}}'

    es.indices.create(index="dbpedia201510", body=body)

    for dico in data_file:
        # json_lines.open already yields parsed dicts, so no extra JSON
        # decoding is needed here.

        for k, v in dico.items():
            props.update(v.keys())
            es.index(index="dbpedia201510",
                     doc_type="entity",
                     id=k,
                     body=v,
                     timeout='60s')