Example #1
def read_test_file_all(tokenizer, truncate=512):
    """Read the OLID level-A test set and attach labels for all three subtasks."""
    df = pd.read_csv(os.path.join(OLID_PATH, 'testset-levela.tsv'), sep='\t')
    df_a = pd.read_csv(os.path.join(OLID_PATH, 'labels-levela.csv'), sep=',')
    ids = np.array(df['id'].values)
    tweets = np.array(df['tweet'].values)
    label_a = np.array(df_a['label'].values)
    nums = len(df)

    # Process tweets
    tweets = process_tweets(tweets)

    df_b = pd.read_csv(os.path.join(OLID_PATH, 'labels-levelb.csv'), sep=',')
    df_c = pd.read_csv(os.path.join(OLID_PATH, 'labels-levelc.csv'), sep=',')
    label_data_b = dict(zip(df_b['id'].values, df_b['label'].values))
    label_data_c = dict(zip(df_c['id'].values, df_c['label'].values))
    label_b = [label_data_b.get(tweet_id, 'NULL') for tweet_id in ids]
    label_c = [label_data_c.get(tweet_id, 'NULL') for tweet_id in ids]

    token_ids = [
        tokenizer.encode(text=tweets[i],
                         add_special_tokens=True,
                         max_length=truncate,
                         truncation=True) for i in range(nums)
    ]
    mask = np.array(get_mask(token_ids))
    lens = get_lens(token_ids)
    token_ids = np.array(pad_sents(token_ids, tokenizer.pad_token_id))

    return ids, token_ids, lens, mask, label_a, label_b, label_c
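These snippets lean on a few small helpers (`get_mask`, `get_lens`, `pad_sents`) that are not shown here. A minimal sketch consistent with how they are called above, assuming `sents` is a list of token-ID lists of varying length:

def get_lens(sents):
    # Number of tokens in each tweet before padding
    return [len(sent) for sent in sents]

def get_mask(sents):
    # Attention mask: 1 for real tokens, 0 for padding, out to the longest tweet
    max_len = max(len(sent) for sent in sents)
    return [[1] * len(sent) + [0] * (max_len - len(sent)) for sent in sents]

def pad_sents(sents, pad_token_id):
    # Right-pad each tweet with the tokenizer's pad id to a common length
    max_len = max(len(sent) for sent in sents)
    return [sent + [pad_token_id] * (max_len - len(sent)) for sent in sents]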
Example #2
def task_b(filepath: str, tokenizer, truncate=512):
    """Build tokenized inputs and labels for OLID subtask B (labelled tweets only)."""
    nums, ids, tweets, _, label_b, _ = read_file(filepath)
    # Only tweets with a subtask-B label are useful for task B
    useful = label_b != 'NULL'
    ids = ids[useful]
    tweets = tweets[useful]
    label_b = label_b[useful]

    nums = len(label_b)
    # Tokenize (`tokenizer` is passed in, e.g. BertTokenizer.from_pretrained('bert-base-uncased'))
    token_ids = [
        tokenizer.encode(text=tweets[i],
                         add_special_tokens=True,
                         max_length=truncate,
                         truncation=True) for i in range(nums)
    ]
    # Get mask
    mask = np.array(get_mask(token_ids))
    # Get lengths
    lens = get_lens(token_ids)
    # Pad tokens
    token_ids = np.array(pad_sents(token_ids, tokenizer.pad_token_id))

    return ids, token_ids, lens, mask, label_b
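A usage sketch for `task_b`; the checkpoint name and file path are illustrative, and `task_b` plus its helpers are assumed to be importable from this module:

from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
ids, token_ids, lens, mask, label_b = task_b(
    'data/olid-training-v1.0.tsv', tokenizer, truncate=128)
print(token_ids.shape, mask.shape, len(label_b))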
Example #3
def task_a(filepath: str, tokenizer, truncate=512):
    """Build tokenized inputs and labels for OLID subtask A."""
    nums, ids, tweets, label_a, _, _ = read_file(filepath)
    # `tokenizer` is passed in, e.g. BertTokenizer.from_pretrained('bert-base-uncased')
    token_ids = [
        tokenizer.encode(text=tweets[i],
                         add_special_tokens=True,
                         max_length=truncate,
                         truncation=True) for i in range(nums)
    ]
    mask = np.array(get_mask(token_ids))
    lens = get_lens(token_ids)
    token_ids = np.array(pad_sents(token_ids, tokenizer.pad_token_id))

    return ids, token_ids, lens, mask, label_a
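Both `task_a` and `task_b` unpack six values from a `read_file` helper that is not shown. A minimal sketch under the assumption that the training TSV uses the standard OLID columns (`id`, `tweet`, `subtask_a`, `subtask_b`, `subtask_c`), with missing level-B/C labels mapped to the `'NULL'` marker that `task_b` filters on:

import numpy as np
import pandas as pd

def read_file(filepath: str):
    df = pd.read_csv(filepath, sep='\t')
    ids = np.array(df['id'].values)
    tweets = process_tweets(np.array(df['tweet'].values))  # same preprocessing as above
    label_a = np.array(df['subtask_a'].values)
    # Subtask B/C labels are missing for tweets outside those subtasks
    label_b = np.array(df['subtask_b'].fillna('NULL').values)
    label_c = np.array(df['subtask_c'].fillna('NULL').values)
    return len(df), ids, tweets, label_a, label_b, label_c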
Example #4
def plot():
    """
    Serve a plot of the network.
    """
    scale = int(request.args.get('scale') or '10')

    # 'log' defaults to false; any value other than 0/false/off/no enables it
    log = request.args.get('log') or 'false'
    log = log.lower() not in ['0', 'false', 'off', 'no']

    # 'drop' defaults to true; only 1/true/on/yes keeps it enabled
    drop = request.args.get('drop') or 'true'
    drop = drop.lower() in ['1', 'true', 'on', 'yes']

    years = utils.get_years()
    G = utils.get_network(years)
    if len(G) < 1:
        return render_template('plot.html', result={})
    result = {'network_plot': utils.plot_network(G, years, scale=scale)}

    result['years_plot'] = utils.plot_bars(years,
                                           sort=True,
                                           drop=drop,
                                           log=log)

    lasts = utils.get_lasts()
    result['lasts_plot'] = utils.plot_bars(lasts, title="Current position")

    lens = utils.get_lens()
    result['lens_plot'] = utils.plot_bars(lens,
                                          title="Career length so far",
                                          lpos=0.5)

    return render_template('plot.html', result=result)
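The query-string parsing above repeats the same truthy/falsy pattern per flag; a sketch of one way to consolidate it (note this variant falls back to the flag's default for unrecognized values, slightly stricter than the else branches above):

def parse_bool(args, name, default):
    # Map 1/true/on/yes to True and 0/false/off/no to False;
    # anything else keeps the flag's default.
    value = (args.get(name) or '').lower()
    if value in ['1', 'true', 'on', 'yes']:
        return True
    if value in ['0', 'false', 'off', 'no']:
        return False
    return default

# Inside plot():
#   log = parse_bool(request.args, 'log', default=False)
#   drop = parse_bool(request.args, 'drop', default=True)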
Example #5
def read_test_file(task, tokenizer, truncate=512):
    """Read the OLID test set and labels for a single subtask ('a', 'b', or 'c')."""
    df1 = pd.read_csv(os.path.join(OLID_PATH, f'testset-level{task}.tsv'),
                      sep='\t')
    df2 = pd.read_csv(os.path.join(OLID_PATH, f'labels-level{task}.csv'),
                      sep=',')
    ids = np.array(df1['id'].values)
    tweets = np.array(df1['tweet'].values)
    labels = np.array(df2['label'].values)
    nums = len(df1)

    # Process tweets
    tweets = process_tweets(tweets)

    # `tokenizer` is passed in, e.g. BertTokenizer.from_pretrained('bert-base-uncased')
    token_ids = [
        tokenizer.encode(text=tweets[i],
                         add_special_tokens=True,
                         max_length=truncate,
                         truncation=True) for i in range(nums)
    ]
    mask = np.array(get_mask(token_ids))
    lens = get_lens(token_ids)
    token_ids = np.array(pad_sents(token_ids, tokenizer.pad_token_id))

    return ids, token_ids, lens, mask, labels
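The returned arrays are plain NumPy, so they drop straight into PyTorch; a hedged sketch (checkpoint name illustrative, downstream model omitted):

import torch
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
ids, token_ids, lens, mask, labels = read_test_file('a', tokenizer, truncate=128)

input_ids = torch.tensor(token_ids, dtype=torch.long)
attention_mask = torch.tensor(mask, dtype=torch.long)
print(input_ids.shape, attention_mask.shape)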