def read_test_file_all(tokenizer, truncate=512):
    df = pd.read_csv(os.path.join(OLID_PATH, 'testset-levela.tsv'), sep='\t')
    df_a = pd.read_csv(os.path.join(OLID_PATH, 'labels-levela.csv'), sep=',')
    ids = np.array(df['id'].values)
    tweets = np.array(df['tweet'].values)
    label_a = np.array(df_a['label'].values)
    nums = len(df)

    # Process tweets
    tweets = process_tweets(tweets)

    # Tasks B and C only label a subset of the test tweets; fill the rest with 'NULL'
    df_b = pd.read_csv(os.path.join(OLID_PATH, 'labels-levelb.csv'), sep=',')
    df_c = pd.read_csv(os.path.join(OLID_PATH, 'labels-levelc.csv'), sep=',')
    label_data_b = dict(zip(df_b['id'].values, df_b['label'].values))
    label_data_c = dict(zip(df_c['id'].values, df_c['label'].values))
    label_b = [label_data_b[id] if id in label_data_b else 'NULL' for id in ids]
    label_c = [label_data_c[id] if id in label_data_c else 'NULL' for id in ids]

    # Tokenize, then build attention masks, sequence lengths and padded token ids
    token_ids = [
        tokenizer.encode(text=tweets[i], add_special_tokens=True, max_length=truncate)
        for i in range(nums)
    ]
    mask = np.array(get_mask(token_ids))
    lens = get_lens(token_ids)
    token_ids = np.array(pad_sents(token_ids, tokenizer.pad_token_id))

    return ids, token_ids, lens, mask, label_a, label_b, label_c
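# Sketch of how read_test_file_all might be driven end to end. The use of a
# Hugging Face BertTokenizer mirrors the commented-out hints elsewhere in this
# module; the checkpoint name and the demo helper itself are assumptions for
# illustration, not something this function pins down.
def _demo_read_test_file_all():
    from transformers import BertTokenizer

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    ids, token_ids, lens, mask, label_a, label_b, label_c = read_test_file_all(tokenizer)
    # token_ids and mask are padded to a common length; B/C labels may be 'NULL'
    print(token_ids.shape, mask.shape, len(label_b), len(label_c))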
def task_b(filepath: str, tokenizer, truncate=512):
    nums, ids, tweets, _, label_b, _ = read_file(filepath)

    # Only part of the tweets are useful for task b
    useful = label_b != 'NULL'
    ids = ids[useful]
    tweets = tweets[useful]
    label_b = label_b[useful]
    nums = len(label_b)

    # Tokenize
    # tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    token_ids = [
        tokenizer.encode(text=tweets[i], add_special_tokens=True, max_length=truncate)
        for i in range(nums)
    ]
    # Get mask
    mask = np.array(get_mask(token_ids))
    # Get lengths
    lens = get_lens(token_ids)
    # Pad tokens
    token_ids = np.array(pad_sents(token_ids, tokenizer.pad_token_id))

    return ids, token_ids, lens, mask, label_b
def task_a(filepath: str, tokenizer, truncate=512):
    nums, ids, tweets, label_a, _, _ = read_file(filepath)

    # tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    token_ids = [
        tokenizer.encode(text=tweets[i], add_special_tokens=True, max_length=truncate)
        for i in range(nums)
    ]
    mask = np.array(get_mask(token_ids))
    lens = get_lens(token_ids)
    token_ids = np.array(pad_sents(token_ids, tokenizer.pad_token_id))

    return ids, token_ids, lens, mask, label_a
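# Minimal sketch of calling the two sub-task loaders above on a training
# split. The file path and tokenizer checkpoint are assumptions for
# illustration; read_file is expected to return
# (nums, ids, tweets, label_a, label_b, label_c) in the order unpacked above.
def _demo_task_loaders(train_path='olid-training-v1.0.tsv'):
    from transformers import BertTokenizer

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    ids_a, tokens_a, lens_a, mask_a, labels_a = task_a(train_path, tokenizer)
    ids_b, tokens_b, lens_b, mask_b, labels_b = task_b(train_path, tokenizer)
    # task_b keeps only rows whose task-B label is not 'NULL'
    print(tokens_a.shape, tokens_b.shape)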
def plot():
    """ Serve a plot of the network. """
    scale = int(request.args.get('scale') or '10')

    log = request.args.get('log') or 'false'
    log = log.lower() not in ['0', 'false', 'off', 'no']

    drop = request.args.get('drop') or 'true'
    drop = drop.lower() in ['1', 'true', 'on', 'yes']

    years = utils.get_years()
    G = utils.get_network(years)
    if len(G) < 1:
        return render_template('plot.html', result={})

    result = {'network_plot': utils.plot_network(G, years, scale=scale)}
    result['years_plot'] = utils.plot_bars(years, sort=True, drop=drop, log=log)

    lasts = utils.get_lasts()
    result['lasts_plot'] = utils.plot_bars(lasts, title="Current position")

    lens = utils.get_lens()
    result['lens_plot'] = utils.plot_bars(lens, title="Career length so far", lpos=0.5)

    return render_template('plot.html', result=result)
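# How the `plot` view above is registered is not shown in this file; a typical
# Flask wiring would look like the sketch below. The app object, URL rule and
# example query string are assumptions for illustration only.
#
#     from flask import Flask
#     app = Flask(__name__)
#     app.add_url_rule('/plot', view_func=plot)
#     # then e.g. GET /plot?scale=20&log=true&drop=false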
def read_test_file(task, tokenizer, truncate=512):
    df1 = pd.read_csv(os.path.join(OLID_PATH, 'testset-level' + task + '.tsv'), sep='\t')
    df2 = pd.read_csv(os.path.join(OLID_PATH, 'labels-level' + task + '.csv'), sep=',')
    ids = np.array(df1['id'].values)
    tweets = np.array(df1['tweet'].values)
    labels = np.array(df2['label'].values)
    nums = len(df1)

    # Process tweets
    tweets = process_tweets(tweets)

    # tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    token_ids = [
        tokenizer.encode(text=tweets[i], add_special_tokens=True, max_length=truncate)
        for i in range(nums)
    ]
    mask = np.array(get_mask(token_ids))
    lens = get_lens(token_ids)
    token_ids = np.array(pad_sents(token_ids, tokenizer.pad_token_id))

    return ids, token_ids, lens, mask, labels
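# Sketch of loading the test split for a single sub-task with read_test_file.
# The 'a' task id corresponds to the level-a test and label files above; the
# tokenizer checkpoint mirrors the commented hint and is an assumption.
def _demo_read_test_file():
    from transformers import BertTokenizer

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    ids, token_ids, lens, mask, labels = read_test_file('a', tokenizer)
    print(token_ids.shape, mask.shape, labels[:5])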