def main():
    usage = "%prog parsed.ids.jsonlist articles.csv output_dir"
    parser = OptionParser(usage=usage)
    #parser.add_option('-v', dest='vocab_size', default=1000,
    #                  help='Maximum number of words to keep: default=%default')
    parser.add_option('-m', dest='min_df', default=3,
                      help='Minimum occurrence count for context words: default=%default')
    #parser.add_option('-d', dest='max_depth', default=2,
    #                  help='Max depth in parse tree: default=%default')
    #parser.add_option('-p', dest='pos', default=None,
    #                  help='Filter by POS tag(s) (e.g. JJ): default=%default')
    #parser.add_option('--boolarg', action="store_true", dest="boolarg", default=False,
    #                  help='Keyword argument: default=%default')

    (options, args) = parser.parse_args()

    infile = args[0]
    csv_file = args[1]
    output_dir = args[2]

    #max_depth = int(options.max_depth)
    min_df = int(options.min_df)
    #pos = options.pos

    lines = fh.read_jsonlist(infile)
    df = pd.read_csv(csv_file, header=0, index_col=0)

    stopwords = {'mr.', 'ms.', 'mrs.', 'major', 'maj.'}

    # go through all documents and build a vocab of relevant tuple words
    word_counts, entity_contexts = process_lines(lines, stopwords)

    print(word_counts.most_common(n=30))
    print("Size of full vocab = {:d}".format(len(word_counts)))

    vocab = [w for w, c in word_counts.items() if c >= min_df]
    vocab_size = len(vocab)
    print("Size of filtered vocab = {:d}".format(vocab_size))
    vocab.sort()
    vocab_index = dict(zip(vocab, range(len(vocab))))

    outlines = []
    for doc_id, words in entity_contexts.items():
        words = [word for word in words if word in vocab_index]
        if len(words) > 2:
            event_name = df.loc[doc_id, 'title']
            outlines.append({'id': doc_id,
                             'text': ' '.join(words),
                             'event_name': event_name,
                             'name': event_name + '_' + str(doc_id)})

    fh.write_jsonlist(outlines, os.path.join(output_dir, 'tuples.jsonlist'))
def main():
    usage = "%prog msa_db.csv data_dir output_file.jsonlist"
    parser = OptionParser(usage=usage)
    #parser.add_option('--keyword', dest='key', default=None,
    #                  help='Keyword argument: default=%default')
    #parser.add_option('--boolarg', action="store_true", dest="boolarg", default=False,
    #                  help='Keyword argument: default=%default')

    (options, args) = parser.parse_args()

    msa_db = args[0]
    data_dir = args[1]
    output_filename = args[2]

    articles = []
    exclude = ['murderpedia.org',
               'www.gunviolencearchive.org',
               'www.fbi.gov',
               'en.wikipedia.org',
               'www.history.com',
               'web.archive.org']

    df = pd.read_csv(msa_db, header=0)
    index = df.index

    for i in index:
        row = df.loc[i]
        caseid = row['CaseID']
        title = row['Title']
        names = row['Shooter Name'].split()
        #subdirs = glob.glob(os.path.join(data_dir, '*_*'))
        subdir = os.path.join(data_dir, str(caseid) + '_' + '_'.join(names))
        if not os.path.exists(subdir):
            files = glob.glob(os.path.join(data_dir, str(caseid) + '_*', '*.json'))
        else:
            files = glob.glob(os.path.join(subdir, '*.json'))
        print(subdir, len(files))
        for f in files:
            data = fh.read_json(f)
            text = data['text']
            url = data['url']
            parts = url.split('/')
            domain = parts[2]
            if len(text) > 200:
                if domain not in exclude:
                    articles.append({'id': str(i),
                                     'caseid': str(caseid),
                                     'event_name': title,
                                     'text': text})

    fh.write_jsonlist(articles, output_filename, sort_keys=False)
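
# Side note (not part of the original script): the domain filter above takes
# url.split('/')[2], which assumes every stored URL has a scheme, e.g.
# "http://domain/...". A hedged alternative sketch using the standard library's
# urllib.parse, which also copes with URLs that lack a path, could look like this:
from urllib.parse import urlparse

def get_domain(url):
    """Return the host part of a URL, e.g. 'en.wikipedia.org'."""
    return urlparse(url).netloc

# example: get_domain('https://en.wikipedia.org/wiki/Main_Page') -> 'en.wikipedia.org'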
def main():
    usage = "%prog input.jsonlist output_dir [labels.csv covariates.csv ...]"
    parser = OptionParser(usage=usage)
    parser.add_option('--test_prop', dest='test_prop', default=0.2,
                      help='proportion of documents to use for test data: default=%default')
    parser.add_option('--train', dest='train', default='train',
                      help='output prefix for training data: default=%default')
    parser.add_option('--test', dest='test', default='test',
                      help='output prefix for test data: default=%default')

    (options, args) = parser.parse_args()

    infile = args[0]
    output_dir = args[1]
    if len(args) > 2:
        csv_files = args[2:]
    else:
        csv_files = []

    test_prop = float(options.test_prop)
    train_prefix = options.train
    test_prefix = options.test

    print("Reading", infile)
    items = fh.read_jsonlist(infile)
    n_items = len(items)

    n_test = int(n_items * test_prop)
    print("Creating random test set of %d items" % n_test)
    n_train = n_items - n_test
    train_indices = np.random.choice(np.arange(n_items), size=n_train, replace=False)
    test_indices = list(set(range(n_items)) - set(train_indices))

    train_items = [items[i] for i in train_indices]
    test_items = [items[i] for i in test_indices]

    fh.write_jsonlist(train_items, os.path.join(output_dir, train_prefix + '.jsonlist'))
    fh.write_jsonlist(test_items, os.path.join(output_dir, test_prefix + '.jsonlist'))

    for file in csv_files:
        print(file)
        basename = os.path.basename(file)
        df = pd.read_csv(file, header=0, index_col=0)
        train_df_index = [df.index[i] for i in train_indices]
        train_df = df.loc[train_df_index]
        train_df.to_csv(os.path.join(output_dir, train_prefix + '.' + basename))
        test_df_index = [df.index[i] for i in test_indices]
        test_df = df.loc[test_df_index]
        test_df.to_csv(os.path.join(output_dir, test_prefix + '.' + basename))
def preprocess_data(csv_file, parsed_dir, output_dir, output_prefix, parse_prefix):
    df = pd.read_csv(csv_file, header=0, index_col=0)
    n_rows, n_columns = df.shape
    print(df.shape)

    files = glob.glob(os.path.join(parsed_dir, '*.json'))
    n_files = len(files)
    #assert n_files == n_rows

    coref_input = []
    pos_tags_all = set()

    print("Parsing %d documents" % n_files)
    for i in range(n_files):
        if i % 1000 == 0 and i > 0:
            print(i)
        valid = df.loc[i, 'matching']
        name = str(df.loc[i, 'shooter_names'])
        # fix an important name error
        name = re.sub('Marteen', 'Mateen', name)
        names = name.split()
        age = str(df.loc[i, 'age'])
        if valid:
            filename = os.path.join(parsed_dir, parse_prefix + '_' + str(i) + '.json')
            parse = fh.read_json(filename)
            # get the text and convert to tokens
            sentences, lemmas, pos_tags, speakers, dependencies, target_mentions, age_pos_tags = \
                process_parse(parse, names, age)
            pos_tags_all.update(age_pos_tags)
            # write output for e2e-coref
            coref_input.append({"id": i,
                                "clusters": [],
                                "doc_key": "nw",
                                "sentences": sentences,
                                "lemmas": lemmas,
                                "speakers": speakers,
                                "pos_tags": pos_tags,
                                "dependencies": dependencies,
                                "coref": [target_mentions]})
            print(i, names, age, len(target_mentions))

    fh.write_jsonlist(coref_input, os.path.join(output_dir, output_prefix + '.parsed.jsonlist'))
def download_articles(name, categories, subset):
    data = []
    print("Downloading articles")
    newsgroups_data = fetch_20newsgroups(subset=subset, categories=categories, remove=())
    for i in range(len(newsgroups_data['data'])):
        line = newsgroups_data['data'][i]
        data.append({'text': line,
                     'group': newsgroups_data['target_names'][newsgroups_data['target'][i]]})
    print(len(data))

    raw_data_dir = os.path.join('data', '20ng', name)
    print("Saving to", raw_data_dir)
    fh.makedirs(raw_data_dir)
    fh.write_jsonlist(data, os.path.join(raw_data_dir, subset + '.jsonlist'))
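
# Example usage (hypothetical, not from the original script): download the train
# and test splits for two of the standard 20 Newsgroups categories into
# data/20ng/sci_example/. The output name and the category list here are
# illustrative only.
from sklearn.datasets import fetch_20newsgroups  # dependency assumed by download_articles

for subset in ['train', 'test']:
    download_articles(name='sci_example',
                      categories=['sci.med', 'sci.space'],
                      subset=subset)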
def preprocess(self):
    """Preprocess the raw data file"""
    if self._check_processed_exists():
        return

    train_lines = []
    test_lines = []
    unlabeled_lines = []

    print("Opening tar file")
    # read in the raw data
    tar = tarfile.open(os.path.join(self.root, self.raw_filename), "r:gz")

    # process all the data files in the archive
    print("Processing documents")
    for m_i, member in enumerate(tar.getmembers()):
        # display occasional progress
        if (m_i + 1) % 5000 == 0:
            print("Processed {:d} / 100000".format(m_i + 1))
        # get the internal file name
        parts = member.name.split(os.sep)
        if len(parts) > 3:
            split = parts[1]    # train or test
            label = parts[2]    # pos, neg, or unsup
            name = parts[3].split('.')[0]
            doc_id, rating = name.split('_')
            doc_id = int(doc_id)
            rating = int(rating)
            # read the text from the archive
            f = tar.extractfile(member)
            raw = f.read()
            text = raw.decode("utf-8")
            if label != 'unsup':
                # save the text, label, and original file name
                doc = {'id': split + '_' + str(doc_id),
                       'text': text,
                       'sentiment': label,
                       'orig': member.name,
                       'rating': rating}
                if split == 'train':
                    train_lines.append(doc)
                elif split == 'test':
                    test_lines.append(doc)
            else:
                doc = {'id': 'unlabeled_' + str(doc_id),
                       'text': text,
                       'sentiment': None,
                       'orig': member.name,
                       'rating': rating}
                unlabeled_lines.append(doc)

    print("Saving processed data to {:s}".format(self.root))
    fh.write_jsonlist(train_lines, os.path.join(self.root, self.train_file))
    fh.write_jsonlist(test_lines, os.path.join(self.root, self.test_file))
    fh.write_jsonlist(unlabeled_lines, os.path.join(self.root, self.unlabeled_file))
def main():
    usage = "%prog articles.jsonlist metadata.csv output_dir"
    parser = OptionParser(usage=usage)
    #parser.add_option('--keyword', dest='key', default=None,
    #                  help='Keyword argument: default=%default')
    #parser.add_option('--boolarg', action="store_true", dest="boolarg", default=False,
    #                  help='Keyword argument: default=%default')

    (options, args) = parser.parse_args()

    infile = args[0]
    meta_file = args[1]
    output_dir = args[2]

    articles = fh.read_jsonlist(infile)
    df = pd.read_csv(meta_file, header=0, index_col=0)
    df.index = [str(i) for i in df.index]
    print(df.head())

    victim_counts = []
    fatality_counts = []
    white = []
    black = []
    mental = []
    outlines = []
    for line_i, line in enumerate(articles):
        if line_i % 1000 == 0:
            print(line_i)
        caseid = int(line['caseid'])
        name = line['name']
        if caseid == 156 or caseid == 168:
            # differentiate on name for two ids that have duplicates
            row = df[(df['CaseID'] == caseid) & (df['name'] == name)]
        else:
            # otherwise, just use the id
            row = df[df['CaseID'] == caseid]
        line['state'] = str(row['state'].values[0])
        line['white'] = int(row['ekg_white'])
        line['black'] = int(row['ekg_black'])
        white.append(int(row['ekg_white']))
        black.append(int(row['ekg_black']))
        line['mental'] = int(row['mental'])
        mental.append(int(row['mental']))
        line['fate'] = str(row['fate_at_scene'].values[0])
        line['fatalities'] = int(row['fatalities'])
        line['victims'] = int(row['victims'])
        victim_counts.append(int(row['victims']))
        fatality_counts.append(int(row['fatalities']))
        outlines.append(line)

    fh.write_jsonlist(outlines, os.path.join(output_dir, 'articles.metadata.jsonlist'), sort_keys=False)

    ids = list(range(len(victim_counts)))
    victims_df = pd.DataFrame(victim_counts, index=ids, columns=['victims'])
    victims_df.to_csv(os.path.join(output_dir, 'train.victims.csv'))
    fatalities_df = pd.DataFrame(fatality_counts, index=ids, columns=['fatalities'])
    fatalities_df.to_csv(os.path.join(output_dir, 'train.fatalities.csv'))
    white_df = pd.DataFrame(white, index=ids, columns=['white'])
    white_df.to_csv(os.path.join(output_dir, 'train.white.csv'))
    black_df = pd.DataFrame(black, index=ids, columns=['black'])
    black_df.to_csv(os.path.join(output_dir, 'train.black.csv'))
    mental_df = pd.DataFrame(mental, index=ids, columns=['mental'])
    mental_df.to_csv(os.path.join(output_dir, 'train.mental.csv'))
def main():
    usage = "%prog data_dir output_dir output_prefix"
    parser = OptionParser(usage=usage)
    #parser.add_option('--year', dest='year', default=1987,
    #                  help='Year: default=%default')
    parser.add_option('--gzip', action="store_true", dest="gzip", default=False,
                      help='gzip output: default=%default')
    #parser.add_option('--word2vec', action="store_true", dest="word2vec", default=False,
    #                  help='Output data processed for word2vec: default=%default')
    parser.add_option('--lower', action="store_true", dest="lower", default=False,
                      help='Lower-case words: default=%default')
    parser.add_option('--replace_digits', action="store_true", dest="replace_digits", default=False,
                      help='Replace digits with #: default=%default')
    parser.add_option('--fix_punct', action="store_true", dest="fix_punct", default=False,
                      help='Fix some punctuation: default=%default')
    #parser.add_option('--timestamp', dest='timestamp', default=None,
    #                  help='List of words to timestamp (comma-separated): default=%default')

    (options, args) = parser.parse_args()

    base_dir = args[0]
    outdir = args[1]
    output_prefix = args[2]
    years = [str(year) for year in range(1987, 2008)]
    do_gzip = options.gzip
    word2vec = False
    lower = options.lower
    replace_digits = options.replace_digits
    fix_punct = options.fix_punct
    #words_to_timestamp = options.timestamp
    #if words_to_timestamp is not None:
    #    words_to_timestamp = words_to_timestamp.split(',')

    outfile = None
    if word2vec:
        outfile = os.path.join(outdir, output_prefix + '.txt')
        if os.path.exists(outfile):
            sys.exit("Error: output file already exists.")
        with codecs.open(outfile, 'w') as f:
            f.write('')

    n_words = 0
    outlines = []
    for year in years:
        data_dir = os.path.join(base_dir, year)
        files = glob.glob(os.path.join(data_dir, '*.tgz'))
        files.sort()
        for tgz in files:
            print(tgz)
            tar = tarfile.open(tgz, "r:gz")
            for member in tar.getmembers():
                #print(tgz, member.name)
                f = tar.extractfile(member)
                if f is not None:
                    name = member.name
                    parts = name.split('/')
                    month = int(parts[0])
                    day = int(parts[1])
                    #print(tgz, member, f)
                    xml_string = f.read()
                    root = et.fromstring(xml_string)
                    headlines = []
                    paragraphs = []
                    for body in root.findall('body'):
                        #print(body)
                        for head in body.findall('body.head'):
                            for headline in head.findall('hedline'):
                                for child in headline:
                                    if child.text is not None:
                                        if 'class' not in child.attrib:
                                            headlines.append(child.text)
                        for content in body.findall('body.content'):
                            #print(content)
                            for block in content.findall('block'):
                                #print(block)
                                if block.attrib['class'] == 'full_text':
                                    for child in block:
                                        if child.text is not None:
                                            if child.text[:5] != 'LEAD:':
                                                paragraphs.append(child.text)
                    if len(paragraphs) > 0:
                        try:
                            if word2vec:
                                if len(headlines) > 0:
                                    for headline in headlines:
                                        lines = headline.split('\n')
                                        for line in lines:
                                            n_words += len(line.split())
                                            outlines.append(line)
                                for paragraph in paragraphs:
                                    lines = paragraph.split('\n')
                                    for line in lines:
                                        n_words += len(line.split())
                                        outlines.append(line)
                            else:
                                headline = '\n\n'.join(headlines)
                                body = '\n\n'.join(paragraphs)
                                headline = fix_line(headline, lower, replace_digits, fix_punct)
                                body = fix_line(body, lower, replace_digits, fix_punct)
                                outlines.append({'body': body,
                                                 'headline': headline,
                                                 'year': year,
                                                 'month': month,
                                                 'day': day})
                        except:
                            print(tgz, member.name)
                            print(headlines)
                            print(paragraphs)
                            print(year)
                            print(month)
                            print(day)
                            sys.exit()

        if word2vec:
            output_line = ''
            for line in outlines:
                output_line += line + '\n'
            output_line = fix_line(output_line, lower, replace_digits, fix_punct)
            #if words_to_timestamp is not None:
            #    for word in words_to_timestamp:
            #        output_line = re.sub(word, word + '_' + str(year), output_line)
            with codecs.open(outfile, 'a') as f:
                f.write(output_line)
        else:
            outfile = os.path.join(outdir, output_prefix + '_' + year + '.jsonlist')
            if do_gzip:
                outfile += '.gz'
            fh.write_jsonlist(outlines, outfile, sort_keys=False, do_gzip=do_gzip)

    print("Total tokens = %d" % n_words)
            'stance': s['Stance'],
            'body_id': s['Body ID'],
            'body': bodies[s['Body ID']]} for s in stances]
    return data


if __name__ == '__main__':
    random.seed(3000)
    split = 0.8

    train_stances = 'fnc-1/train_stances.csv'
    train_bodies = 'fnc-1/train_bodies.csv'
    test_stances = 'fnc-1/competition_test_stances.csv'
    test_bodies = 'fnc-1/competition_test_bodies.csv'

    train_data = 'data/train_data.csv'
    dev_data = 'data/dev_data.csv'
    test_data = 'data/test_data.csv'

    train = merge_data(train_stances, train_bodies)
    random.shuffle(train)
    split_ind = int(len(train) * split)
    train, dev = train[:split_ind], train[split_ind:]
    test = merge_data(test_stances, test_bodies)

    fh.write_jsonlist(train, train_data, sort_keys=True)
    fh.write_jsonlist(dev, dev_data, sort_keys=True)
    fh.write_jsonlist(test, test_data, sort_keys=True)
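
# The opening of merge_data is cut off above. A hedged reconstruction, assuming
# the standard FNC-1 CSV columns ('Headline', 'Body ID', 'Stance' in the stances
# file; 'Body ID', 'articleBody' in the bodies file), might look like the sketch
# below; the 'headline' key is an assumption about the truncated part.
import csv

def merge_data(stances_file, bodies_file):
    # read the article bodies, keyed by Body ID (kept as strings, to match the
    # dictionary lookup in the fragment above)
    with open(bodies_file, encoding='utf-8', newline='') as f:
        bodies = {row['Body ID']: row['articleBody'] for row in csv.DictReader(f)}
    # read the stances and join each one with its body text
    with open(stances_file, encoding='utf-8', newline='') as f:
        stances = list(csv.DictReader(f))
    data = [{'headline': s['Headline'],
             'stance': s['Stance'],
             'body_id': s['Body ID'],
             'body': bodies[s['Body ID']]} for s in stances]
    return data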
def main():
    usage = "%prog parsed.ids.jsonlist articles.csv output_dir"
    parser = OptionParser(usage=usage)
    #parser.add_option('-v', dest='vocab_size', default=1000,
    #                  help='Maximum number of words to keep: default=%default')
    parser.add_option('-m', dest='min_df', default=3,
                      help='Minimum occurrence count for context words: default=%default')
    parser.add_option('-d', dest='max_depth', default=2,
                      help='Max depth in parse tree: default=%default')
    parser.add_option('-p', dest='pos', default=None,
                      help='Filter by POS tag(s) (e.g. JJ): default=%default')
    #parser.add_option('--filter', action="store_true", dest="filter", default=False,
    #                  help='Filter out unknown mental: default=%default')

    (options, args) = parser.parse_args()

    infile = args[0]
    csv_file = args[1]
    output_dir = args[2]

    max_depth = int(options.max_depth)
    min_df = int(options.min_df)
    pos = options.pos
    #filter = options.filter

    lines = fh.read_jsonlist(infile)
    df = pd.read_csv(csv_file, header=0, index_col=0)

    stopwords = set()

    # go through all documents and build a vocab of relevant tuple words
    search_terms = ['mental', 'terrorism']
    word_counts, entity_contexts, words_found = process_lines(lines, stopwords, search_terms,
                                                              max_depth=max_depth, pos=pos)

    print(word_counts.most_common(n=30))
    print("Size of full vocab = {:d}".format(len(word_counts)))

    vocab = [w for w, c in word_counts.items() if c >= min_df]
    vocab_size = len(vocab)
    print("Size of filtered vocab = {:d}".format(vocab_size))
    vocab.sort()
    vocab_index = dict(zip(vocab, range(len(vocab))))

    outlines = []
    for doc_id, words in entity_contexts.items():
        # keep only words in the filtered vocabulary
        words = [word for word in words if word in vocab_index]
        if len(words) > 2:
            event_name = df.loc[doc_id, 'title']
            text = ' '.join(words)
            outline = {'id': doc_id, 'text': text, 'event_name': event_name}
            outline['name'] = event_name + '_' + str(doc_id)
            outline['simple_race'] = df.loc[doc_id, 'simple_race']
            outline['white'] = int(df.loc[doc_id, 'white'])
            for term in search_terms:
                if words_found[doc_id][term] > 0:
                    outline[term] = 1
                else:
                    outline[term] = 0
            #if filter:
            #    if outline['mental'] != 'Unknown':
            #        outlines.append(outline)
            #else:
            outlines.append(outline)

    """
    all_events = {}
    for doc_id, words in entity_contexts.items():
        # filter out duplicates
        words = [word for word in words if word in vocab_index]
        event_name = df.loc[doc_id, 'title']
        if event_name in all_events:
            all_events[event_name]['words'] = all_events[event_name]['words'] + words
        else:
            all_events[event_name] = {'id': doc_id,
                                      'words': words,
                                      'event_name': event_name,
                                      'name': event_name + '_' + str(doc_id)}

    outlines = []
    for key, value in all_events.items():
        if len(value['words']) > 2:
            outlines.append({'id': value['id'], 'text': ' '.join(value['words']), 'event_name': key})
    """

    fh.write_jsonlist(outlines, os.path.join(output_dir, 'contexts.jsonlist'))
def preprocess_data(csv_file, parsed_dir, output_dir, output_prefix, parse_prefix):
    df = pd.read_csv(csv_file, header=0, index_col=0)
    n_rows, n_columns = df.shape
    print(df.shape)

    files = glob.glob(os.path.join(parsed_dir, '*.json'))
    n_files = len(files)
    #assert n_files == n_rows

    coref_input = []
    pos_tags_all = set()

    # known duplicate (or otherwise excluded) events, keyed by MSA index
    skip_events = {272: 'Kalamazoo duplicate',
                   276: 'Belfair duplicate',
                   293: 'Sherman, Texas duplicate',
                   280: 'Chelsea, MA duplicate',
                   283: 'Kansas City duplicate',
                   331: 'Cape Coral'}

    print("Parsing %d documents" % n_files)
    for i in range(n_files):
        if i % 1000 == 0 and i > 0:
            print(i)
        valid = df.loc[i, 'matching']
        name = str(df.loc[i, 'shooter_names'])
        # fix an important name error
        name = re.sub('Marteen', 'Mateen', name)
        names = name.split()
        age = str(df.loc[i, 'age'])
        event_name = 'msa-' + re.sub(r'\s', '-', df.loc[i, 'title'])
        msa_index = int(df.loc[i, 'df_index'])
        if msa_index in skip_events:
            print("Skipping", i, event_name)
        elif valid:
            filename = os.path.join(parsed_dir, parse_prefix + '_' + str(i) + '.json')
            parse = fh.read_json(filename)
            # get the text and convert to tokens
            sentences, sentences_tagged, target_mentions, pos_tags, dependencies = \
                process_parse(parse, names, age, event_name)
            sentences_pruned = []
            for sent in sentences_tagged:
                tokens = [token for token in sent if token != '__DROP__']
                sentences_pruned.append(' '.join(tokens))
            text_pruned = ' '.join(sentences_pruned)
            # write output for e2e-coref
            coref_input.append({"id": i,
                                "clusters": [],
                                "doc_key": "nw",
                                "sentences": sentences,
                                "text_tagged": text_pruned,
                                "pos_tags": pos_tags,
                                "dependencies": dependencies,
                                "coref": [target_mentions]})
            print(i, names, age, len(target_mentions))

    fh.write_jsonlist(coref_input, os.path.join(output_dir, output_prefix + '.parsed.jsonlist'))
def append_sims(split_data, filepath, namespace):
    with open(filepath, 'rb') as f:
        data = pickle.load(f)
    data = [d.item() for d in data]
    assert len(split_data) == len(data)
    for ex, d in tqdm(zip(split_data, data), total=len(data)):
        ex[namespace] = d


train_data = fh.read_jsonlist('train_data.csv')
dev_data = fh.read_jsonlist('dev_data.csv')
test_data = fh.read_jsonlist('test_data.csv')

append_data(train_data, 'train_bodies_senti.pkl', 'body_senti')
append_data(dev_data, 'dev_bodies_senti.pkl', 'body_senti')
append_data(test_data, 'test_bodies_senti.pkl', 'body_senti')

append_data(train_data, 'train_headline_senti.pkl', 'headline_senti')
append_data(dev_data, 'dev_headline_senti.pkl', 'headline_senti')
append_data(test_data, 'test_headline_senti.pkl', 'headline_senti')

append_sims(train_data, 'train_cos_sims.pkl', 'cos_sim')
append_sims(dev_data, 'dev_cos_sims.pkl', 'cos_sim')
append_sims(test_data, 'test_cos_sims.pkl', 'cos_sim')

fh.write_jsonlist(train_data, 'train_data.jsonl')
fh.write_jsonlist(dev_data, 'dev_data.jsonl')
fh.write_jsonlist(test_data, 'test_data.jsonl')
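
# All of the scripts above rely on a small helper module imported as fh
# (read_json, read_jsonlist, write_jsonlist, makedirs). The module itself is not
# shown; the minimal sketch below is only an assumption consistent with the call
# sites above (the sort_keys and do_gzip keyword arguments appear there; the
# default values and everything else here are guesses).
import gzip
import json
import os


def makedirs(path):
    # create a directory (and any parents) if it does not already exist
    os.makedirs(path, exist_ok=True)


def read_json(path):
    with open(path, encoding='utf-8') as f:
        return json.load(f)


def read_jsonlist(path):
    # one JSON object per line
    with open(path, encoding='utf-8') as f:
        return [json.loads(line) for line in f if line.strip()]


def write_jsonlist(items, path, sort_keys=True, do_gzip=False):
    # write one JSON object per line, optionally gzip-compressed
    opener = gzip.open if do_gzip else open
    with opener(path, 'wt', encoding='utf-8') as f:
        for item in items:
            f.write(json.dumps(item, sort_keys=sort_keys) + '\n')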