def read_proxies(filepath):
    """Read proxy entries from a .jsonl file and return them as a list."""
    proxy_list = []
    with jsonlines.open(filepath) as reader:
        for state in reader:
            proxy_list.append(state)
    # No explicit reader.close() needed: the with-block closes the reader.
    return proxy_list
def read_states_from_jsonl(filename):
    """Read a .jsonl file and return its rows as a list, skipping the header row."""
    states = []
    with jsonlines.open(filename) as reader:
        for i, state in enumerate(reader):
            # Skip the header row. The original also compared i == 'None', which can
            # never be true for an integer index, so that check is dropped.
            if i == 0:
                continue
            states.append(state)
    return states
def generate_jsonlines(input, output):
    """Convert an Elasticsearch-style JSON dump into a .jsonl file of page sources."""
    import jsonlines

    # 'rU' mode is deprecated; plain 'r' is equivalent on Python 3.
    with open(input, 'r') as pn_file:
        raw = json.load(pn_file)
    hits = raw['hits']['hits']

    with jsonlines.open(output, mode='w') as writer:
        for hit in hits:
            source = hit[WEBPAGE_SOURCE]
            # The original called dump(); write() is the jsonlines.Writer method.
            writer.write(source)
def load_intermediate_data(path, format='jsonlines'):
    """Load [sid, content] pairs from a .jsonl file, or raw rows from a .csv file."""
    dataset = []
    if format == 'jsonlines':
        import jsonlines
        with jsonlines.open(path, mode='r') as reader:
            for line in reader:
                dataset.append([line['sid'], line['content']])
    elif format == 'csv':
        import csv
        # Python 3: csv files are opened in text mode (the original used 'rb').
        with open(path, 'r', newline='') as csvfile:
            reader = csv.reader(csvfile)
            for row in reader:
                dataset.append(row)
    return dataset
def generate_intermediate_data(dataset, output_path, format='jsonlines'):
    """Write [sid, content] pairs to a .jsonl file, or raw rows to a .csv file."""
    if format == 'jsonlines':
        import jsonlines
        with jsonlines.open(output_path, mode='w') as writer:
            for data in dataset:
                # The original called dump(); write() is the jsonlines.Writer method.
                writer.write({'sid': data[0], 'content': data[1]})
    elif format == 'csv':
        import csv
        # Python 3: csv files are opened in text mode (the original used 'wb').
        with open(output_path, 'w', newline='') as csvfile:
            spamwriter = csv.writer(csvfile)
            for data in dataset:
                spamwriter.writerow(data)
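# A minimal, hypothetical round-trip using the two helpers above; the file name
# 'intermediate.jsonl' and the sample rows are illustrative, not from the original code.
rows = [['s1', 'first sentence'], ['s2', 'second sentence']]
generate_intermediate_data(rows, 'intermediate.jsonl', format='jsonlines')
restored = load_intermediate_data('intermediate.jsonl', format='jsonlines')
assert restored == rows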
def read_states_from_excel():
    """Read data from the Excel file and write it to a .jsonl file. Returns the file path."""
    wb = load_workbook(filename='states/Top50CitiesinUSwithZipCodes.xlsx')
    ws = wb.active  # get_active_sheet() is deprecated in openpyxl
    with jsonlines.open('states/tmpStates.jsonl', 'w') as f:
        for row in ws.rows:
            tmp_list = [str(cell.value) for cell in row]
            f.write(tmp_list)
    return 'states/tmpStates.jsonl'
def startCollectProxies():
    """Open the proxy-list site and collect proxies into a .jsonl file."""
    driver = webdriver.Firefox()
    driver.get("http://hideme.ru/proxy-list/?maxtime=250&type=45#list")
    with jsonlines.open('proxies/proxy.jsonl', 'w') as f:
        cont = driver.find_element_by_xpath('//tbody')
        for i, row in enumerate(cont.find_elements_by_xpath('//tr')):
            if i == 0:  # skip the table header row
                continue
            f.write(conv_str(row.text))
    time.sleep(1)
    driver.quit()
    return 'proxies/proxy.jsonl'
def get_proxy(html):
    """Parse the proxy table from the page HTML and write selected columns to a .jsonl file."""
    soup = BeautifulSoup(html, 'lxml')
    table = soup.find('table', class_='proxy__t')
    with jsonlines.open('proxy/proxy.jsonl', 'w') as f:
        tbody = table.find('tbody')
        for tr in tbody.find_all('tr'):
            pr_list = []
            for k, td in enumerate(tr.find_all('td')):
                if k in (0, 1):
                    pr_list.append(td.text)
                elif k == 4:
                    pr_list.append(conv_str(td.text))
            f.write(pr_list)
    return 'proxy/proxy.jsonl'
def main():
    root_path = os.path.join('data', 'datasets', 'sst')
    if not os.path.exists(root_path):
        os.makedirs(root_path)
    os.chdir(root_path)

    base_url = 'https://raw.githubusercontent.com/PrincetonML/SIF/master/data'
    filenames = ['sentiment-train', 'sentiment-dev', 'sentiment-test']
    for filename in filenames:
        if not os.path.exists(filename):
            os.system('wget %s' % (base_url + '/' + filename))

    spacy_nlp = spacy.load('en')
    for filename in filenames:
        print(filename)
        with codecs.open(filename, 'r', 'utf-8') as f, \
                jsonlines.open(filename + '.jsonl', mode='w') as writer:
            lines = [l for l in f]
            progress = tqdm(lines, mininterval=1, leave=False)
            for line in progress:
                text, label = line.strip().split('\t')
                text_spacy = ' '.join([t.text for t in spacy_nlp(text)])
                writer.write({'text': text_spacy, 'label': label})
def main():
    description()
    page_count = get_page_count(get_html_proxy(BASE_URL))
    print('Total pages: %s \n\n' % page_count)

    file_name = input("Name your output file, e.g. (ParsedData): ---> ")
    t = input("Type timeout in seconds between every 8 parsed doctors (e.g. 15): ---> ")
    start_page = input("Type the page number where you want to start: ---> ")
    end_page = input("Type the page number where you want to stop: ---> ")
    print("\n\n\n")

    # Clamp the timeout to 1..60 seconds, defaulting to 20.
    if t == '':
        t = 20
    elif int(t) < 1:
        t = 1
    elif int(t) > 60:
        t = 60
    else:
        t = int(t)

    start_page = 0 if start_page == '' else int(start_page)
    end_page = page_count if end_page == '' else int(end_page)

    print("Start collecting proxies...")
    proxy_list = read_proxies(get_proxy(get_html_proxy(PROXY_SITE)))
    print(proxy_list)
    print("Proxies have been collected!")
    print("Let's start parsing:")

    pcounter = start_page
    for page in range(start_page, end_page + 1):
        doctors = []
        link = BASE_URL if page == 0 else BASE_URL + '/' + str(page)
        doctors.extend(parse(get_html(link)))
        with jsonlines.open('tmp/' + str(pcounter) + '.jsonl', 'w') as tf:
            for item in doctors:
                tf.write(item)
        pcounter += 1
        print('Parsed page: %s' % pcounter)
        print('Parsing progress: %d%%' % ((page / end_page) * 100))
        time.sleep(t)  # timeout between pages

    print("\n\n\nParsing finished")
    print("\nStart saving parsed data...\n")
    w_into_file(file_name)
    print("Finish! Now you can close this program. ;)\n")
    q = input("Type 'q' and press ENTER to close this program :) ---> ")
    if q == 'q':
        quit()
    # Tail of the string-cleaning helper (its def line is above this excerpt).
    string = re.sub(r"\s{2,}", " ", string)
    string = re.sub(r"@", "", string)
    return string.lower()


MAX_SEQUENCE_LENGTH = 1000
MAX_NB_WORDS = 20000
EMBEDDING_DIM = 100
VALIDATION_SPLIT = 0.1111806

count = 0
full_count = 0
train_val_data = []
test_data = []
with jsonlines.open('instances.jsonl') as reader:
    for obj in reader.iter(type=dict, skip_invalid=True):
        count += 1
        full_count += 1
        # The first 17600 objects go to train/val, the rest to test.
        if count > 17600:
            test_data.append(obj)
        if count <= 17600:
            train_val_data.append(obj)

count = 0
truth_data = []
with jsonlines.open('truth.jsonl') as reader:
    for obj in reader.iter(type=dict, skip_invalid=True):
        truth_data.append(obj)

run(train_val_data, test_data, truth_data)
def read_data(data_file):
    data = []
    with jsonlines.open(data_file) as reader:
        for obj in reader:
            data.append(obj)
    return data
CHUNK_SIZE = 100000
# Optional limit after which to abort chunking.
LIMIT = None


# Create the writer for the current chunked segment.
def create_current_writer(i):
    chunk_no = math.floor(i / CHUNK_SIZE)
    output_filename = OUT_DIR + '/ocdata.chunk' + str(chunk_no) + '.jsonl'
    current_writer = jsonlines.open(output_filename, mode='w')
    return current_writer


print("Chunking data from input: " + SOURCE_PATH)
i = 0
current_writer = create_current_writer(i)
reader = jsonlines.open(SOURCE_PATH)
for obj in tqdm(reader):
    if (i != 0) and (i % CHUNK_SIZE == 0):
        current_writer.close()
        current_writer = create_current_writer(i)
    # Stop on limit, if any.
    if LIMIT is not None and i >= LIMIT:
        break
    # Add line number.
    obj['_line'] = i
    current_writer.write(obj)
    i += 1
reader.close()
current_writer.close()
print("Done chunking " + str(i) + " records.")
def print_hi(name): print(f'Hi, {name}') DATA_PATH_JSON = work_dir + "data/sarcasm_data.json" BERT_TARGET_EMBEDDINGS = work_dir + "data/bert-output.jsonl" INDICES_FILE = work_dir + "data/split_indices.p" AUDIO_PICKLE = work_dir + "data/audio_features.p" BATCH_SIZE = 32 model_path = os.path.join(work_dir + 'saved', f'lfdnn-mustard-M.pth') model_name = 'lf_dnn' RESULT_FILE = work_dir + "output/independent/ta/{}.json" device = torch.device('cuda:%d' % 0 if torch.cuda.is_available() else 'cpu') print(f'device: {device}') print() def pickle_loader(filename): if sys.version_info[0] < 3: return pickle.load(open(filename, 'rb')) else: return pickle.load(open(filename, 'rb'), encoding="latin1") class MMDataset(Dataset): def __init__(self, text_feature, video_feature_mean, audio_feature_mean, label_out): # print('MMDataset') self.vision = video_feature_mean self.text = text_feature self.audio = audio_feature_mean self.label = label_out # print('self.label') def __len__(self): return len(self.label) def __getitem__(self, index): # print('index') # print(index) sample = { 'text': torch.Tensor(self.text[index]), 'vision': torch.Tensor(self.vision[index]), 'audio': torch.Tensor(self.audio[index]), 'labels': torch.Tensor(self.label[index]).type(torch.LongTensor) } return sample dataset_json = json.load(open(DATA_PATH_JSON)) print(type(dataset_json), len(dataset_json)) # dict 690 print(list(dataset_json.keys())[:2]) tmp = list(dataset_json.keys())[:2] print(tmp[0]) tmp = dataset_json[tmp[0]] print(tmp) # text text_bert_embeddings = [] with jsonlines.open(BERT_TARGET_EMBEDDINGS) as reader: print('opend bert : ', BERT_TARGET_EMBEDDINGS) for obj in reader: CLS_TOKEN_INDEX = 0 features = obj['features'][CLS_TOKEN_INDEX] bert_embedding_target = [] for layer in [0, 1, 2, 3]: bert_embedding_target.append( np.array(features["layers"][layer]["values"])) bert_embedding_target = np.mean(bert_embedding_target, axis=0) # print(bert_embedding_target.shape) 768 text_bert_embeddings.append(np.copy(bert_embedding_target)) print('np.array(text_bert_embeddings).shape bert 768 ') print(np.array(text_bert_embeddings).shape) # 690 768 # video video_features_file = h5py.File( work_dir + 'data/features/utterances_final/resnet_pool5.hdf5') # combined feature index # audio dict (283 12) (283 11) audio_features = pickle_loader(AUDIO_PICKLE) TEXT_ID = 0 VIDEO_ID = 1 AUDIO_ID = 2 SHOW_ID = 3 # parse_data data_input, data_output = [], [] # data_input [(text,video)(text,video)] # text:768 vide0: frame:2048 for idx, ID in enumerate(dataset_json.keys()): # print(idx, 'processing ... ', ID) 0 processing ... 
1_60 data_input.append(( text_bert_embeddings[idx], # 0 TEXT_ID video_features_file[ID][()], # 1 VIDEO_ID audio_features[ID], # 2 AUDIO_ID dataset_json[ID]["show"] # 2 SHOW_ID )) data_output.append(int(dataset_json[ID]["sarcasm"])) print('close video_features_file') video_features_file.close() splits = 5 skf = StratifiedKFold(n_splits=splits, shuffle=True) split_indices = [ (train_index, test_index) for train_index, test_index in skf.split(data_input, data_output) ] print('split_indices: ') # print(split_indices) print(split_indices[0][0].shape, split_indices[0][1].shape) print(len(split_indices)) # (552,)(138, ) # 5 if not os.path.exists(INDICES_FILE): pickle.dump(split_indices, open(INDICES_FILE, 'wb'), protocol=2) split_indices = pickle_loader(INDICES_FILE) print('after pickle_loader: ') print(split_indices[0][0].shape, split_indices[0][1].shape) print(len(split_indices)) def get_data_loader(train_ind_SI): dataLoader = None # (text,video,AUDIO) train_input = [data_input[ind] for ind in train_ind_SI] # [0 1 0 1 ] train_out = np.array([data_output[ind] for ind in train_ind_SI]) # expand dim (n,) (n,1) it may be useless for crossentropy train_out = np.expand_dims(train_out, axis=1) def getData(ID=None): return [instance[ID] for instance in train_input] # Text Feature train_text_feature = getData(TEXT_ID) # video Feature train_video_feature = getData(VIDEO_ID) train_video_feature_mean = np.array([ np.mean(feature_vector, axis=0) for feature_vector in train_video_feature ]) # audio Feature audio = getData(AUDIO_ID) # (552, 283) train_audio_feature = np.array( [np.mean(feature_vector, axis=1) for feature_vector in audio]) train_dataset = MMDataset(train_text_feature, train_video_feature_mean, train_audio_feature, train_out) train_dataLoader = DataLoader(train_dataset, batch_size=BATCH_SIZE, num_workers=0, shuffle=True) dataLoader = train_dataLoader return dataLoader class SubNet(nn.Module): def __init__(self, in_size, hidden_size, dropout): super(SubNet, self).__init__() self.norm = nn.BatchNorm1d(in_size) self.drop = nn.Dropout(p=dropout) self.linear_1 = nn.Linear(in_size, hidden_size) self.linear_2 = nn.Linear(hidden_size, hidden_size) self.linear_3 = nn.Linear(hidden_size, hidden_size) def forward(self, x): normed = self.norm(x) dropped = self.drop(normed) y = F.relu(self.linear_1(dropped)) # y = self.linear_1(dropped) y = F.relu(self.linear_2(y)) y = F.relu(self.linear_3(y)) out = y return out # for audio only have 283 so small the model to avoid overfitting class SubAudioNet(nn.Module): def __init__(self, in_size, hidden_size, dropout): super(SubAudioNet, self).__init__() self.norm = nn.BatchNorm1d(in_size) self.drop = nn.Dropout(p=dropout) self.linear_1 = nn.Linear(in_size, hidden_size) self.linear_2 = nn.Linear(hidden_size, hidden_size) self.linear_3 = nn.Linear(hidden_size, hidden_size) def forward(self, x): normed = self.norm(x) dropped = self.drop(normed) y = F.relu(self.linear_1(dropped)) # y = self.linear_1(dropped) y = F.relu(self.linear_2(y)) y = F.relu(self.linear_3(y)) out = y return out class LF_DNN(nn.Module): def __init__(self): super(LF_DNN, self).__init__() self.text_in, self.video_in, self.audio_in = 768, 2048, 283 self.text_hidden, self.video_hidden, self.audio_hidden = 128, 128, 32 # self.text_out = 32 self.post_fusion_dim1 = 256 self.post_fusion_dim = 32 self.video_prob, self.text_prob, self.audio_prob, self.post_fusion_prob = ( 0.2, 0.2, 0.2, 0.2) self.video_subnet = SubNet(self.video_in, self.video_hidden, self.video_prob) self.audio_subnet = 
SubAudioNet(self.audio_in, self.audio_hidden, self.audio_prob) self.text_subnet = SubNet(self.text_in, self.text_hidden, self.text_prob) self.post_fusion_dropout = nn.Dropout(p=self.post_fusion_prob) self.post_fusion_dropout1 = nn.Dropout(p=self.post_fusion_prob) self.post_fusion_layer_1 = nn.Linear( self.text_hidden + self.audio_hidden + self.text_in + self.audio_in, self.post_fusion_dim1) self.post_fusion_layer_4 = nn.Linear(self.post_fusion_dim1, self.post_fusion_dim1) self.post_fusion_layer_5 = nn.Linear(self.post_fusion_dim1, self.post_fusion_dim1) # self.post_fusion_layer_2 = nn.Linear(self.post_fusion_dim1, self.post_fusion_dim) self.post_fusion_layer_6 = nn.Linear( self.post_fusion_dim1 + self.text_in + self.audio_in, self.post_fusion_dim1) self.post_fusion_layer_7 = nn.Linear(self.post_fusion_dim1, self.post_fusion_dim1) self.post_fusion_layer_8 = nn.Linear(self.post_fusion_dim1, self.post_fusion_dim) self.post_fusion_layer_3 = nn.Linear(self.post_fusion_dim, 2) def forward(self, text_x, video_x, audio_x): video_h = self.video_subnet(video_x) audio_h = self.audio_subnet(audio_x) text_h = self.text_subnet(text_x) # 128+32+16 = 176 fusion_h = torch.cat([text_h, audio_h, text_x, audio_x], dim=-1) x = self.post_fusion_dropout(fusion_h) # x = self.post_fusion_layer_1(x) x = F.relu(self.post_fusion_layer_1(x), inplace=True) x = F.relu(self.post_fusion_layer_4(x), inplace=True) x = F.relu(self.post_fusion_layer_5(x), inplace=True) # x = F.relu(self.post_fusion_layer_2(x), inplace=True) x = torch.cat([x, text_x, audio_x], dim=-1) x = self.post_fusion_dropout1(x) x = F.relu(self.post_fusion_layer_6(x), inplace=True) x = F.relu(self.post_fusion_layer_7(x), inplace=True) x = F.relu(self.post_fusion_layer_8(x), inplace=True) output = self.post_fusion_layer_3(x) return output # model = SubNet(2048,128,0.2) # model1 = SubNet(768,32,0.2) model2 = LF_DNN() model2.to(device) # summary(model,(2048,)) # summary(model1,(768,)) summary(model2, [(768, ), (2048, ), (283, )]) # summary(model, [(1, 16, 16), (1, 28, 28)]) learning_rate = 5e-4 weight_decay = 0.0 early_stop = 20 def do_test(model2, dataLoader, mode="VAL"): criterion = nn.CrossEntropyLoss() model2.eval() y_pred, y_true = [], [] eval_loss = 0.0 eval_acc = 0.0 with torch.no_grad(): with tqdm(dataLoader) as td: for batch_data in td: vision = batch_data['vision'].to(device) text = batch_data['text'].to(device) audio = batch_data['audio'].to(device) labels = batch_data['labels'].to(device) outputs = model2(text, vision, audio) loss = criterion(outputs, labels.squeeze()) eval_loss += loss.item() eval_acc += (outputs.argmax(1) == torch.squeeze( labels.long())).sum().item() y_pred.append(outputs.argmax(1).cpu()) y_true.append(labels.squeeze().long().cpu()) pred, true = torch.cat(y_pred), torch.cat(y_true) eval_loss = eval_loss / len(pred) eval_acc = eval_acc / len(pred) # print('len dataLoader:',len(dataLoader)) 1 print("%s-(%s) >> loss: %.4f acc: %.4f" % (mode, 'lf_dnn', eval_loss, eval_acc)) return eval_acc, pred, true def do_train(model2, train_dataLoader, val_dataLoader): best_acc = 0 epochs, best_epoch = 0, 0 criterion = nn.CrossEntropyLoss() optimizer = optim.Adam(model2.parameters(), lr=learning_rate, weight_decay=weight_decay) while True: epochs += 1 y_pred, y_true = [], [] model2.train() train_loss = 0.0 train_acc = 0.0 with tqdm(train_dataLoader) as td: for batch_data in td: vision = batch_data['vision'].to(device) audio = batch_data['audio'].to(device) text = batch_data['text'].to(device) labels = batch_data['labels'].to(device) 
optimizer.zero_grad() # forward outputs = model2(text, vision, audio) loss = criterion(outputs, labels.squeeze()) # backward loss.backward() # update optimizer.step() train_loss += loss.item() train_acc += (outputs.argmax(1) == torch.squeeze( labels.long())).sum().item() y_pred.append(outputs.argmax(1).cpu()) y_true.append(labels.squeeze().long().cpu()) pred, true = torch.cat(y_pred), torch.cat(y_true) train_loss = train_loss / len(pred) train_acc = train_acc / len(pred) print( "TRAIN-(%s) (%d/%d)>> loss: %.4f train_acc: %.4f" % ('lf_dnn', epochs - best_epoch, epochs, train_loss, train_acc)) val_acc, y_pred, y_true = do_test(model2, val_dataLoader, mode="VAL") # change the best value : weighted avg result_dict = classification_report(y_true, y_pred, digits=3, output_dict=True) val_acc = result_dict["weighted avg"]["f1-score"] print(f'weighted avg f1-score {val_acc}') if val_acc > best_acc: best_acc, best_epoch = val_acc, epochs print(model_path) if os.path.exists(model_path): os.remove(model_path) torch.save(model2.cpu().state_dict(), model_path) model2.to(device) # early stop if epochs - best_epoch >= early_stop: print(f'the best epochs:{best_epoch},the best acc:{best_acc}') return # break results = [] def five_fold(): def getSpeakerIndependent(): train_ind_SI, test_ind_SI = [], [] for ind, data in enumerate(data_input): if data[SHOW_ID] == "FRIENDS": test_ind_SI.append(ind) else: train_ind_SI.append(ind) train_index, test_index = train_ind_SI, test_ind_SI return np.array(train_index), np.array(test_index) def getSpeakerIndependent_ours(): train_ind_SI, test_ind_SI = [], [] test_speakers = ['HOWARD', 'SHELDON'] for idx, ID in enumerate(dataset_json.keys()): speaker = dataset_json[ID]["speaker"] if speaker in test_speakers: test_ind_SI.append(idx) else: train_ind_SI.append(idx) train_index, test_index = train_ind_SI, test_ind_SI return np.array(train_index), np.array(test_index) for fold in range(5): print(fold, '-' * 50) # (train_index, test_index) = getSpeakerIndependent() (train_index, test_index) = getSpeakerIndependent_ours() train_ind_SI = train_index val_ind_SI = test_index test_ind_SI = test_index print(train_ind_SI.shape, val_ind_SI.shape, test_ind_SI.shape) train_dataLoader = get_data_loader(train_ind_SI) val_dataLoader = get_data_loader(val_ind_SI) test_dataLoader = get_data_loader(test_ind_SI) model2 = LF_DNN() model2.to(device) do_train(model2, train_dataLoader, val_dataLoader) print() print(f'load:{model_path}') model2.load_state_dict(torch.load(model_path)) model2.to(device) # do test val_acc, y_pred, y_true = do_test(model2, test_dataLoader, mode="TEST") print('Test: ', val_acc) # print(pred,true) result_string = classification_report(y_true, y_pred, digits=3) print('confusion_matrix(y_true, y_pred)') print(confusion_matrix(y_true, y_pred)) print(result_string) result_dict = classification_report(y_true, y_pred, digits=3, output_dict=True) results.append(result_dict) # Dumping result to output if not os.path.exists(os.path.dirname(RESULT_FILE)): os.makedirs(os.path.dirname(RESULT_FILE)) with open(RESULT_FILE.format(model_name), 'w') as file: json.dump(results, file) print('dump results into ', RESULT_FILE.format(model_name)) return None five_fold() def printResult(model_name=None): results = json.load(open(RESULT_FILE.format(model_name), "rb")) weighted_precision, weighted_recall = [], [] weighted_fscores = [] print("#" * 20) for fold, result in enumerate(results): weighted_fscores.append(result["weighted avg"]["f1-score"]) weighted_precision.append(result["weighted 
avg"]["precision"]) weighted_recall.append(result["weighted avg"]["recall"]) print("Fold {}:".format(fold + 1)) print( "Weighted Precision: {} Weighted Recall: {} Weighted F score: {}" .format(result["weighted avg"]["precision"], result["weighted avg"]["recall"], result["weighted avg"]["f1-score"])) print("#" * 20) print("Avg :") print( "Weighted Precision: {:.3f} Weighted Recall: {:.3f} Weighted F score: {:.3f}" .format(np.mean(weighted_precision), np.mean(weighted_recall), np.mean(weighted_fscores))) tmp = { 'precision:': np.mean(weighted_precision), 'recall': np.mean(weighted_recall), 'f1': np.mean(weighted_fscores) } file_name = 'five_results_average' with open(RESULT_FILE.format(file_name), 'w') as file: json.dump(tmp, file) printResult(model_name=model_name)
def _load(self) -> List:
    with jsonlines.open(self._filepath, mode='r') as f:
        image_info = list(f)
    return image_info
def _read_dataset(self):
    """Open the news datasets and divide their contents by category."""
    print("Opening dataset News Category Dataset v2 (200k entries) and splitting data...")

    def add(category, item, label):
        # Append both the headline and the short description under the given label.
        self.categories[category].append(
            [self.format_sentence(item['headline'].lower()), label])
        self.categories[category].append(
            [self.format_sentence(item['short_description'].lower()), label])

    with jsonlines.open(
            os.path.join(ROOT, 'classifier', 'datasets',
                         'News_Category_Dataset_v2.json')) as news:
        for item in news.iter(type=dict, skip_invalid=True):
            cat = item['category'].lower()
            if 'style' in cat or 'home' in cat:
                add('lifestyle', item, 'Lifestyle')
            if 'food' in cat or 'taste' in cat:
                add('food', item, 'Food')
            if 'art' in cat:
                add('arts', item, 'Arts')
            if 'healthy' in cat:
                add('health', item, 'Health')
            if cat in self.categories.keys():
                add(cat, item, cat.title())

    print("Done splitting data. Opening UCI News Aggregator (400k entries) and splitting data...")
    with open(os.path.join(ROOT, 'classifier', 'datasets',
                           'uci-news-aggregator.csv')) as input_csv:
        news_reader = csv.reader(input_csv, delimiter=",")
        # Column 4 holds the category code: b=business, t=tech, e=entertainment, m=health.
        label_by_code = {'b': ('business', 'Business'), 't': ('tech', 'Tech'),
                         'e': ('entertainment', 'Entertainment'), 'm': ('health', 'Health')}
        for row in news_reader:
            if row[4] in label_by_code:
                category, label = label_by_code[row[4]]
                self.categories[category].append(
                    [self.format_sentence(row[1].lower()), label])
    print("Done splitting")
def open_file(self):
    return jsonlines.open(self.converted_filename, mode="w")
# import warnings
# warnings.filterwarnings("error")
import sys

print(sys.getrecursionlimit())
sys.setrecursionlimit(5000)

for file in os.listdir(os.path.join(prefix, 'clean_final')):
    path = os.path.join(prefix, 'clean_final', file)  # the original nested an extra os.path.join here
    data = []
    if 'train' in file:
        continue
    with jsonlines.open(path) as reader:
        for obj in reader:
            data.append(obj)

    all_scores = defaultdict(dict)
    i = 0
    final_documents = {}
    for bill in data:
        i += 1
        if i % 50 == 0:
            print(i)
        summary = bill['clean_summary']
        doc = bill['clean_text']
        # Tail of the metrics dict returned by the function above this excerpt.
        'f1_support': round(f1_support, 4),
        'precision_support': round(precision_support, 4),
        'recall_support': round(recall_support, 4),
        'f1_contradict': round(f1_contradict, 4),
        'precision_contradict': round(precision_contradict, 4),
        'recall_contradict': round(recall_contradict, 4)
    }


parser = argparse.ArgumentParser()
parser.add_argument('--dataset', type=str, required=True)
parser.add_argument('--rationale-selection', type=str, required=True)
parser.add_argument('--label-prediction', type=str, required=True)
args = parser.parse_args()

dataset = {data['id']: data for data in jsonlines.open(args.dataset)}
rationale_selection = list(jsonlines.open(args.rationale_selection))
label_prediction = jsonlines.open(args.label_prediction)


def get_gold_label(claim_id: int, doc_id: int):
    labels = {es['label']
              for es in dataset[claim_id]['evidence'].get(str(doc_id)) or []}
    if labels:
        return next(iter(labels))
    else:
        return 'NOT_ENOUGH_INFO'
def write_jsonlines(array, file):
    with jsonlines.open(file, mode='w') as writer:
        writer.write_all(array)
def read_jsonlines(file, handler=lambda obj: obj):
    dat = []
    with jsonlines.open(file) as reader:
        for obj in reader:
            dat.append(handler(obj))
    return dat
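# A short, hypothetical usage sketch for write_jsonlines / read_jsonlines above;
# the file name and records are placeholders, not part of the original code.
records = [{'id': 1, 'text': 'hello'}, {'id': 2, 'text': 'world'}]
write_jsonlines(records, 'records.jsonl')
# The optional handler post-processes each object as it is read back.
texts = read_jsonlines('records.jsonl', handler=lambda obj: obj['text'])
assert texts == ['hello', 'world']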
def test_jsonl(self):
    with jsonlines.open(imd_data_, mode='r') as obj:
        for i in obj:
            print(i)  # the original used the Python 2 print statement
print("inside sent ret") dataset_paths = [ "fever_full_binary_dev_sent_ret_split1", "fever_full_binary_dev_sent_ret_split2", "fever_full_binary_dev_sent_ret_split3" ] model_path = "models_fever_full/sentence_retrieval_models/model_bert_fever_full_binaryAcc73.h5" model = load_model(model_path) print("model loaded") embeddings_paths = [ "fever_full_dev_binary_sent_ret_bert_60k", "fever_full_dev_binary_sent_ret_bert_60k_120k", "fever_full_dev_binary_sent_ret_bert_120k_plus" ] results = jsonlines.open(sr_results_path, mode="w") for i in range(len(dataset_paths)): dataset_path = "/scratch/kkuma12s/github/fact-validation/thesis-code/Proof_Extraction/data/fever-full/complete_pipeline/sent_ret/bert/" + dataset_paths[ i] + ".jsonl" test_model = testModel(dataset_path, model_path) claims_sent_vec_combined = test_model.load_compressed_pickle_file( "/scratch/kkuma12s/new_embeddings/" + embeddings_paths[i]) print("test data size ", test_model.test_data.shape) print("inside sentence retrieval and loading bert embeddings ") print("claims vec 1 ", claims_sent_vec_combined.shape) batch_size = 32 total_possible_batch_sizes = len( claims_sent_vec_combined) / batch_size print("total possible batch sizes ", total_possible_batch_sizes)
def load_rows(search_collection):
    with jsonlines.open(search_collection) as reader:
        return list(reader)
    WHERE EXISTS (
        SELECT new_path FROM UNNEST(difference) WHERE (new_path LIKE "%.py")
    )
    AND regexp_contains(subject, 'bug|fix|issue|error')
"""

query_job = client.query(lang_query)  # Make an API request.
print("Querying languages:")
counter = 0
headers = {'Accept': 'application/vnd.github.v3+json',
           'Authorization': 'token ' + git_token}

with jsonlines.open('data/data.jsonl', mode='w') as writer:
    for row in query_job:
        commit_sha = row.commit
        repo = row.repo_name[0] if isinstance(row.repo_name, list) else row.repo_name
        for diff in row.difference:
            old_file_path = diff['old_path']
            new_file_path = diff['new_path']
            url_before = "https://api.github.com/repos/%s/contents/%s?ref=%s" % (repo, old_file_path, commit_sha)
            url_after = "https://api.github.com/repos/%s/contents/%s?ref=%s" % (repo, new_file_path, commit_sha + '^')
            try:
download = False
sc_folder = ".\\media\\sc\\"
mov_folder = ".\\media\\mov\\"

with open('top_games.json') as json_file:
    games = json.load(json_file)

# Example game from top_games.json:
# "570": {appid=570, name="Dota 2", developer="Valve", publisher="Valve", score_rank="", positive=1206694,
#         negative=221642, userscore=0, owners="100,000,000 .. 200,000,000", average_forever=36567,
#         average_2weeks=1517, median_forever=1236, median_2weeks=741, price="0", initialprice="0",
#         discount="0", ccu=618777}

start_index = 5041
with jsonlines.open("detailed_games.jsonl", "a") as detailed_games:
    # Iterate through all top games
    for index in range(start_index, len(games.keys())):
        game = games[list(games)[index]]
        name = game["name"]
        clean_name = re.sub(r'[/\\*:?\"<>|]', '', name)  # Clean name for use in file storage
        app_id = str(game["appid"])
        print(str(index) + " Game: " + name)
        success = False
        timeout_count = 0
        while not success:
            try:
                steam_details = requests.get(steam_store_api,
                                             params={"appids": app_id, "language": "english"})
                steam_details = steam_details.json()
def test_write_out_subsentences(self): merged_entity_emb_file = tempfile.NamedTemporaryFile() out_file = tempfile.NamedTemporaryFile() data_file = tempfile.NamedTemporaryFile() entity_dir = "test/entity_db" entity_map_dir = "entity_mappings" alias_cand_map = "alias2qids.json" data_config = DottedDict(entity_dir=entity_dir, entity_map_dir=entity_map_dir, alias_cand_map=alias_cand_map) entitiy_symbols = EntitySymbolsSubclass() entitiy_symbols.dump(save_dir=os.path.join(entity_dir, entity_map_dir)) num_examples = 3 total_num_mentions = 7 M = 3 K = 2 hidden_size = 2 # create data file -- just needs aliases and sentence indices data = [{ 'aliases': ['a', 'b'], 'sent_idx_unq': 0 }, { 'aliases': ['c', 'd', 'e', 'f', 'g'], 'sent_idx_unq': 1 }] with jsonlines.open(data_file.name, 'w') as f: for row in data: f.write(row) merged_storage_type = np.dtype([('hidden_size', int), ('sent_idx', int), ('alias_list_pos', int), ('entity_emb', float, hidden_size), ('final_loss_pred', int), ('final_loss_prob', float), ('final_loss_cand_probs', float, K)]) merged_entity_emb = np.memmap(merged_entity_emb_file.name, dtype=merged_storage_type, mode="w+", shape=(total_num_mentions, )) # 2 sentences, 1st sent has 1 subsentence, 2nd sentence has 2 subsentences - 7 mentions total merged_entity_emb['hidden_size'] = hidden_size # first men merged_entity_emb[0]['sent_idx'] = 0 merged_entity_emb[0]['alias_list_pos'] = 0 merged_entity_emb[0]['entity_emb'] = np.array([0, 1]) merged_entity_emb[0]['final_loss_pred'] = 1 merged_entity_emb[0]['final_loss_prob'] = 0.9 merged_entity_emb[0]['final_loss_cand_probs'] = np.array([0.1, 0.9]) # second men merged_entity_emb[1]['sent_idx'] = 0 merged_entity_emb[1]['alias_list_pos'] = 1 merged_entity_emb[1]['entity_emb'] = np.array([2, 3]) merged_entity_emb[1]['final_loss_pred'] = 1 merged_entity_emb[1]['final_loss_prob'] = 0.9 merged_entity_emb[1]['final_loss_cand_probs'] = np.array([0.1, 0.9]) # third men merged_entity_emb[2]['sent_idx'] = 1 merged_entity_emb[2]['alias_list_pos'] = 0 merged_entity_emb[2]['entity_emb'] = np.array([4, 5]) merged_entity_emb[2]['final_loss_pred'] = 0 merged_entity_emb[2]['final_loss_prob'] = 0.9 merged_entity_emb[2]['final_loss_cand_probs'] = np.array([0.9, 0.1]) # fourth men merged_entity_emb[3]['sent_idx'] = 1 merged_entity_emb[3]['alias_list_pos'] = 1 merged_entity_emb[3]['entity_emb'] = np.array([6, 7]) merged_entity_emb[3]['final_loss_pred'] = 0 merged_entity_emb[3]['final_loss_prob'] = 0.9 merged_entity_emb[3]['final_loss_cand_probs'] = np.array([0.9, 0.1]) # fifth men merged_entity_emb[4]['sent_idx'] = 1 merged_entity_emb[4]['alias_list_pos'] = 2 merged_entity_emb[4]['entity_emb'] = np.array([10, 11]) merged_entity_emb[4]['final_loss_pred'] = 1 merged_entity_emb[4]['final_loss_prob'] = 0.9 merged_entity_emb[4]['final_loss_cand_probs'] = np.array([0.1, 0.9]) # sixth men merged_entity_emb[5]['sent_idx'] = 1 merged_entity_emb[5]['alias_list_pos'] = 3 merged_entity_emb[5]['entity_emb'] = np.array([12, 13]) merged_entity_emb[5]['final_loss_pred'] = 1 merged_entity_emb[5]['final_loss_prob'] = 0.9 merged_entity_emb[5]['final_loss_cand_probs'] = np.array([0.1, 0.9]) # seventh men merged_entity_emb[6]['sent_idx'] = 1 merged_entity_emb[6]['alias_list_pos'] = 4 merged_entity_emb[6]['entity_emb'] = np.array([14, 15]) merged_entity_emb[6]['final_loss_pred'] = 1 merged_entity_emb[6]['final_loss_prob'] = 0.9 merged_entity_emb[6]['final_loss_cand_probs'] = np.array([0.1, 0.9]) num_processes = 2 train_in_candidates = True dump_embs = True 
write_data_labels(num_processes, merged_entity_emb_file.name, merged_storage_type, data_file.name, out_file.name, train_in_candidates, dump_embs, data_config) ''' "a":[["Q1",10.0],["Q4",6]], "b":[["Q2",5.0],["Q1",3]], "c":[["Q1",30.0],["Q2",3]], "d":[["Q4",20],["Q3",15.0]], "e":[["Q1",10.0],["Q4",6]], "f":[["Q2",5.0],["Q1",3]], "g":[["Q1",30.0],["Q2",3]] ''' all_lines = [] with open(out_file.name) as check_f: for line in check_f: all_lines.append(ujson.loads(line)) gold_lines = [{ 'sent_idx_unq': 0, 'aliases': ['a', 'b'], 'qids': ["Q4", "Q1"], 'probs': [0.9, 0.9], 'cands': [["Q1", "Q4"], ["Q2", "Q1"]], 'cand_probs': [[0.1, 0.9], [0.1, 0.9]], 'entity_ids': [4, 1], 'ctx_emb_ids': [0, 1] }, { 'sent_idx_unq': 1, 'aliases': ['c', 'd', 'e', 'f', 'g'], 'qids': ["Q1", "Q4", "Q4", "Q1", "Q2"], 'probs': [0.9, 0.9, 0.9, 0.9, 0.9], 'cands': [["Q1", "Q2"], ["Q4", "Q3"], ["Q1", "Q4"], ["Q2", "Q1"], ["Q1", "Q2"]], 'cand_probs': [[0.9, 0.1], [0.9, 0.1], [0.1, 0.9], [0.1, 0.9], [0.1, 0.9]], 'entity_ids': [1, 4, 4, 1, 2], 'ctx_emb_ids': [2, 3, 4, 5, 6] }] assert len(all_lines) == len(gold_lines) for i in range(len(gold_lines)): self.assertDictEqual( gold_lines[i], all_lines[i], f"{ujson.dumps(gold_lines[i], indent=4)} VS {ujson.dumps(all_lines[i], indent=4)}" ) # clean up if os.path.exists(entity_dir): shutil.rmtree(entity_dir) merged_entity_emb_file.close() out_file.close() data_file.close()
def prepare_SLURP(data_folder, save_folder, slu_type, train_splits, skip_prep=False): """ This function prepares the SLURP dataset. If the folder does not exist, the zip file will be extracted. If the zip file does not exist, it will be downloaded. data_folder : path to SLURP dataset. save_folder: path where to save the csv manifest files. slu_type : one of the following: "direct":{input=audio, output=semantics} "multistage":{input=audio, output=semantics} (using ASR transcripts in the middle) "decoupled":{input=transcript, output=semantics} (using ground-truth transcripts) train_splits : list of splits to be joined to form train .csv skip_prep: If True, data preparation is skipped. """ if skip_prep: return # If the data folders do not exist, we need to download/extract the data if not os.path.isdir(os.path.join(data_folder, "slurp_synth")): # Check for zip file and download if it doesn't exist zip_location = os.path.join(data_folder, "slurp_synth.tar.gz") if not os.path.exists(zip_location): url = "https://zenodo.org/record/4274930/files/slurp_synth.tar.gz?download=1" download_file(url, zip_location, unpack=True) else: print("Extracting slurp_synth...") shutil.unpack_archive(zip_location, data_folder) if not os.path.isdir(os.path.join(data_folder, "slurp_real")): # Check for zip file and download if it doesn't exist zip_location = os.path.join(data_folder, "slurp_real.tar.gz") if not os.path.exists(zip_location): url = "https://zenodo.org/record/4274930/files/slurp_real.tar.gz?download=1" download_file(url, zip_location, unpack=True) else: print("Extracting slurp_real...") shutil.unpack_archive(zip_location, data_folder) splits = [ "train_real", "train_synthetic", "devel", "test", ] id = 0 for split in splits: new_filename = (os.path.join(save_folder, split) + "-type=%s.csv" % slu_type) if os.path.exists(new_filename): continue print("Preparing %s..." 
% new_filename) IDs = [] duration = [] wav = [] wav_format = [] wav_opts = [] semantics = [] semantics_format = [] semantics_opts = [] transcript = [] transcript_format = [] transcript_opts = [] jsonl_path = os.path.join(data_folder, split + ".jsonl") if not os.path.isfile(jsonl_path): if split == "train_real": url_split = "train" else: url_split = split url = ( "https://github.com/pswietojanski/slurp/raw/master/dataset/slurp/" + url_split + ".jsonl") download_file(url, jsonl_path, unpack=False) with jsonlines.open(jsonl_path) as reader: for obj in reader: scenario = obj["scenario"] action = obj["action"] sentence_annotation = obj["sentence_annotation"] num_entities = sentence_annotation.count("[") entities = [] for slot in range(num_entities): type = (sentence_annotation.split("[")[slot + 1].split("]") [0].split(":")[0].strip()) filler = (sentence_annotation.split("[")[slot + 1].split( "]")[0].split(":")[1].strip()) entities.append({"type": type, "filler": filler}) for recording in obj["recordings"]: IDs.append(id) if "synthetic" in split: audio_folder = "slurp_synth/" else: audio_folder = "slurp_real/" path = os.path.join(data_folder, audio_folder, recording["file"]) signal = read_audio(path) duration.append(signal.shape[0] / 16000) wav.append(path) wav_format.append("flac") wav_opts.append(None) transcript_ = obj["sentence"] if slu_type == "decoupled": transcript_ = transcript_.upper() transcript.append(transcript_) transcript_format.append("string") transcript_opts.append(None) semantics_dict = { "scenario": scenario, "action": action, "entities": entities, } semantics_ = str(semantics_dict).replace( ",", "|" ) # Commas in dict will make using csv files tricky; replace with pipe. semantics.append(semantics_) semantics_format.append("string") semantics_opts.append(None) id += 1 df = pd.DataFrame({ "ID": IDs, "duration": duration, "wav": wav, "semantics": semantics, "transcript": transcript, }) df.to_csv(new_filename, index=False) # Merge train splits train_splits = [ split + "-type=%s.csv" % slu_type for split in train_splits ] merge_csvs(save_folder, train_splits, "train-type=%s.csv" % slu_type)
def test_merge_subsentences(self): test_full_emb_file = tempfile.NamedTemporaryFile() test_merged_emb_file = tempfile.NamedTemporaryFile() gold_merged_emb_file = tempfile.NamedTemporaryFile() num_examples = 3 total_num_mentions = 7 M = 3 K = 2 hidden_size = 2 # create full embedding file storage_type_full = np.dtype([('M', int), ('K', int), ('hidden_size', int), ('sent_idx', int), ('subsent_idx', int), ('alias_list_pos', int, M), ('entity_emb', float, M * hidden_size), ('final_loss_true', int, M), ('final_loss_pred', int, M), ('final_loss_prob', float, M), ('final_loss_cand_probs', float, M * K)]) full_emb = np.memmap(test_full_emb_file.name, dtype=storage_type_full, mode='w+', shape=(num_examples, )) # 2 sentences, 1st sent has 1 subsentence, 2nd sentence has 2 subsentences # first sentence full_emb['hidden_size'] = hidden_size full_emb['M'] = M full_emb['K'] = K full_emb[0]['sent_idx'] = 0 full_emb[0]['subsent_idx'] = 0 # last alias is padded full_emb[0]['alias_list_pos'] = np.array([0, 1, -1]) full_emb[0]['final_loss_true'] = np.array([0, 1, -1]) # entity embs are flattened full_emb[0]['entity_emb'] = np.array([0, 1, 2, 3, 0, 0]) full_emb[1]['sent_idx'] = 1 full_emb[1]['subsent_idx'] = 0 full_emb[1]['alias_list_pos'] = np.array([0, 1, 2]) # last alias goes with next subsentence full_emb[1]['final_loss_true'] = np.array([1, 1, -1]) full_emb[1]['entity_emb'] = np.array([4, 5, 6, 7, 8, 9]) full_emb[2]['sent_idx'] = 1 full_emb[2]['subsent_idx'] = 1 full_emb[2]['alias_list_pos'] = np.array([2, 3, 4]) full_emb[2]['final_loss_true'] = np.array([1, 1, 1]) full_emb[2]['entity_emb'] = np.array([10, 11, 12, 13, 14, 15]) # create merged embedding file storage_type_merged = np.dtype([('hidden_size', int), ('sent_idx', int), ('alias_list_pos', int), ('entity_emb', float, hidden_size), ('final_loss_pred', int), ('final_loss_prob', float), ('final_loss_cand_probs', float, K)]) merged_emb_gold = np.memmap(gold_merged_emb_file.name, dtype=storage_type_merged, mode="w+", shape=(total_num_mentions, )) merged_emb_gold['entity_emb'] = np.array([[0, 1], [2, 3], [4, 5], [6, 7], [10, 11], [12, 13], [14, 15]]) # create data file -- just needs aliases and sentence indices data = [{ 'aliases': ['a', 'b'], 'sent_idx_unq': 0 }, { 'aliases': ['c', 'd', 'e', 'f', 'g'], 'sent_idx_unq': 1 }] temp_file = tempfile.NamedTemporaryFile(delete=False).name with jsonlines.open(temp_file, 'w') as f: for row in data: f.write(row) # assert that output of merge_subsentences is correct num_processes = 2 eval_utils.merge_subsentences(num_processes, temp_file, test_merged_emb_file.name, storage_type_merged, test_full_emb_file.name, storage_type_full, dump_embs=True) bootleg_merged_emb = np.memmap(test_merged_emb_file.name, dtype=storage_type_merged, mode="r+") merged_emb_gold = np.memmap(gold_merged_emb_file.name, dtype=storage_type_merged, mode="r+") assert len(bootleg_merged_emb) == total_num_mentions for i in range(len(bootleg_merged_emb)): assert np.array_equal(bootleg_merged_emb[i]['entity_emb'], merged_emb_gold[i]['entity_emb']) # clean up if os.path.exists(temp_file): os.remove(temp_file) test_full_emb_file.close() test_merged_emb_file.close() gold_merged_emb_file.close()
def create_current_writer(i):
    chunk_no = math.floor(i / CHUNK_SIZE)
    output_filename = OUT_DIR + '/ocdata.chunk' + str(chunk_no) + '.jsonl'
    current_writer = jsonlines.open(output_filename, mode='w')
    return current_writer
def write_jsonl(filename, items):
    with jsonlines.open(filename, mode='w',
                        dumps=json.JSONEncoder(default=json_serial).encode) as writer:
        writer.write_all(items)
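# json_serial is referenced above but not defined in this snippet. A common
# implementation (an assumption here, not necessarily the author's) converts
# datetime/date values to ISO-8601 strings so the custom encoder can emit them.
from datetime import date, datetime


def json_serial(obj):
    """Fallback serializer for objects the stock JSON encoder rejects."""
    if isinstance(obj, (datetime, date)):
        return obj.isoformat()
    raise TypeError("Type %s is not JSON serializable" % type(obj))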
def write_jsonlist(list_of_json_objects, output_filename):
    with jsonlines.open(output_filename, mode='w') as writer:
        writer.write_all(list_of_json_objects)
parser = argparse.ArgumentParser()
parser.add_argument('--corpus', type=str, required=True)
parser.add_argument('--dataset', type=str, required=True)
parser.add_argument('--model', type=str, required=True)
parser.add_argument('--rationale-selection', type=str, required=True)
parser.add_argument('--mode', type=str, default='claim_and_rationale',
                    choices=['claim_and_rationale', 'only_claim', 'only_rationale'])
parser.add_argument('--output', type=str, required=True)
args = parser.parse_args()

print(args.mode)

corpus = {doc['doc_id']: doc for doc in jsonlines.open(args.corpus)}
dataset = jsonlines.open(args.dataset)
rationale_selection = jsonlines.open(args.rationale_selection)
output = jsonlines.open(args.output, 'w')

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device "{device}"')

tokenizer = AutoTokenizer.from_pretrained(args.model)
config = AutoConfig.from_pretrained(args.model, num_labels=3)
model = AutoModelForSequenceClassification.from_pretrained(args.model, config=config).eval().to(device)

LABELS = ['CONTRADICT', 'NOT_ENOUGH_INFO', 'SUPPORT']
    # Tail of the Features() helper (its def line is above this excerpt).
    for xsf in strinngfeatures:
        f1.append(G_features[xsf])
        G_features[xsf] = 0
    f2 = np.array(f1)
    instructions['instructions'] = l_instructions
    f3.append(f2)
    instructions['features'] = l_features
    return instructions


p_DataSet = []
c1 = 0
f3 = []
y1 = []
y2 = []
with jsonlines.open("train_dataset.jsonl") as reader:
    for _instructions in reader:  # (type=dict, skip_invalid=True)
        p_DataSet.append(Features(_instructions))
        y1.append(G_opt[_instructions['opt']])
        y2.append(G_compiler[_instructions['compiler']])
        # np.array([int(c) for c in ])

db = file_archive("DbMalware.txt")
db['featuresnames'] = strinngfeatures
db['featuresdata'] = f3
db['opt'] = y1
db['G_opt'] = G_opt
db['compiler'] = y2
db['G_compiler'] = G_compiler
db.dump()
print('done')
if len(sys.argv) == 3:
    if sys.argv[2] == 'n':
        POWER = sys.argv[2]
    elif sys.argv[2] == 'y':
        POWER = sys.argv[2]
    else:
        print("Specify y for including power and n for not including it, e.g.: python bagofwords.py s n")
        exit()
else:
    print("Specify s for sender or r for receiver, e.g.: python bagofwords.py s")
    exit()

# Import data. Specify the directory path.
data_path = 'data/'
with jsonlines.open(data_path + 'train.jsonl', 'r') as reader:
    train = list(reader)

# VAL NOT USED IN LOG REG FOR CONSISTENCY WITH NEURAL
# with jsonlines.open(data_path + 'validation.jsonl', 'r') as reader:
#     dev = list(reader)

with jsonlines.open(data_path + 'test.jsonl', 'r') as reader:
    test = list(reader)

# spaCy used for tokenization
nlp = English()

log_reg(train, test)
def test_invalid_mode():
    with pytest.raises(ValueError) as exc:
        jsonlines.open('foo', mode='foo')
    assert 'mode' in str(exc.value)
import numpy as np
import pandas as pd
import os
from tqdm import tqdm
import jsonlines
from input.natural_questions.test_utils import simplify_nq_example
import json

json_dir = 'v1.0-simplified_nq-dev-all.jsonl'

dict_list = []
with open(json_dir) as f:
    for line in tqdm(f):
        dict_list.append(simplify_nq_example(json.loads(line)))

with jsonlines.open('simplified-nq-valid.jsonl', 'w') as writer:
    writer.write_all(dict_list)
def w_into_file(name):
    """Save data to an .xlsx file."""
    workbook = xlsxwriter.Workbook('excel/' + name + '.xls')
    worksheet = workbook.add_worksheet()

    # Header row: one column per field in MAIN_FIELDS
    # (Name, Specialty, Email, Emirates, Contact, Fax, Postal Code, Website, Address, Link).
    columns = 'ABCDEFGHIJ'
    for col, field in zip(columns, MAIN_FIELDS):
        worksheet.write(col + '1', field)

    # Record keys as they appear in the scraped .jsonl files, in column order.
    keys = ['Name', 'Specialty:', 'Email:', 'Emirates:', 'Contact:',
            'Fax:', 'Postal Code:', 'Website:', 'Address:', 'Link']

    items = len(os.listdir("tmp"))
    i = 0
    for itm in range(items):
        with jsonlines.open("tmp/" + str(itm) + ".jsonl") as reader:
            for item in reader:
                # Missing keys are written as empty cells (the original used
                # one try/except KeyError block per column).
                for col, key in zip(columns, keys):
                    worksheet.write(col + str(i + 2), item.get(key, ''))
                i += 1
    workbook.close()
def preprocess_qdraw(): img_size = 128 category = 'baseball' output = 'data/qdraw_{cat}_{img_size}'.format(cat=category, img_size=img_size) if os.path.exists(output): import shutil shutil.rmtree(output) os.mkdir(output) stroke_width = 2 bbox_pad = 20 cmap = plt.get_cmap('jet') need = 200000 with jsonlines.open('data/{cat}.ndjson'.format(cat=category)) as reader: for count, obj in enumerate(reader): # print obj fn = output + '/' + obj['key_id'] + '.svg' if not os.path.isfile(fn): print(count, fn) dwg = svgwrite.Drawing(fn, profile='tiny', size=(img_size,img_size)) drawing = obj['drawing'] num_strokes = len(drawing) cnorm = colors.Normalize(vmin=0, vmax=num_strokes-1) cscalarmap = cmx.ScalarMappable(norm=cnorm, cmap=cmap) # get bbox bbox = [100000, 100000, -100000, -100000] for i, strokes in enumerate(drawing): x = strokes[0] y = strokes[1] bbox[0] = min(bbox[0], np.amin(x)) bbox[1] = min(bbox[1], np.amin(y)) bbox[2] = max(bbox[2], np.amax(x)) bbox[3] = max(bbox[3], np.amax(y)) bbox[0] -= bbox_pad bbox[1] -= bbox_pad bbox[2] += bbox_pad bbox[3] += bbox_pad # make it square dx = bbox[2]-bbox[0] dy = bbox[3]-bbox[1] b_size = float(max(dx,dy)) # normalize and save for i, strokes in enumerate(drawing): x = (np.asarray(strokes[0]) - bbox[0])/b_size*img_size y = (np.asarray(strokes[1]) - bbox[1])/b_size*img_size # t = strokes[2] c = np.asarray(cscalarmap.to_rgba(i))[:3]*255 c_hex = '#%02x%02x%02x' % (int(c[0]), int(c[1]), int(c[2])) dwg.add(dwg.polyline(points=zip(x, y), stroke=c_hex, fill='none', stroke_width=stroke_width)) dwg.viewbox(0, 0, img_size, img_size) dwg.save() if count >= need: break # split dataset file_list = [] for root, _, files in os.walk(output): for file in files: file_list.append(file) num_files = len(file_list) ids = np.random.permutation(num_files) train_id = int(num_files * 0.9) with open(os.path.join(output,'train.txt'), 'w') as f: for id in ids[:train_id]: f.write(file_list[id] + '\n') with open(os.path.join(output,'test.txt'), 'w') as f: for id in ids[train_id:]: f.write(file_list[id] + '\n')
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from stop_words import get_stop_words
from string import punctuation
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.tree import DecisionTreeClassifier

X = []
y = []
z = []
with jsonlines.open('data/reviews.txt', 'r') as f:
    for item in f:
        X.append(item['text'])
        y.append(item['voted_up'])
        z.append(item['early_access'])


def detect_text(text):
    try:
        language = detect(text)
    except:
        language = "unknown"
    return language


def stem(tokens, lang):
import raw_data_process as rdp
import jsonlines

with jsonlines.open('vk_comms.jl') as comments:
    with jsonlines.open('norm_comms.jl', mode='w') as writer:
        for obj in comments:
            line = rdp.jl_line(item=obj)
            sentences = line.normalize()
            for item in sentences:
                writer.write(item)
    # Tail of a try/except in the function above this excerpt.
    except:
        pass
    # return (answer_x, answer_binary)
    # If it's accept, convert the label of the correct one to 1, the others to 0, return all.
    # If it's reject, convert the label of the presented one to 0, and DELETE the rows in the
    # matrix/vector. If the presented one is false, we don't know if the other, non-presented
    # ones were correct or not.
    # Return the text labels, too, so we can look at per-country accuracy later.
    # feat_list.append(feat)


error_count = 0
with jsonlines.open('geo_annotated/geo_country_db.jsonl') as reader:
    X = []
    Y = []
    for obj in reader:
        if obj['answer'] != 'ignore':
            try:
                x, label = entry_to_matrix(obj)  # change to return matrices/vectors
                X.append(x)
                Y.append(label)
            except Exception as e:
                error_count += 1

print(error_count)

# format numpy
import pprint   # needed for pp below (not imported in the original snippet)
import pickle   # needed for pickle.load below (not imported in the original snippet)
import json
import time
import random
import jsonlines
from requests import *

pp = pprint.PrettyPrinter(width=100)

apikey = ''
apisecret = ''
ping_url = 'https://api-v3.receptiviti.com/v3/api/ping'
content_api_url = 'https://api-v3.receptiviti.com/v3/api/content'

author_texts = pickle.load(
    open('/home/username/data/output/_jobs/liwc_posts_merged.pickle', 'rb'))

with jsonlines.open('/home/username/liwc_scores.jsonl', mode='w') as writer:
    # enumerate() and str(i) were missing in the original loop.
    for i, (author, text) in enumerate(author_texts[:1]):
        response_json = test_ping()
        writer.write(response_json)
        print('#' + str(i) + ' - ' + author + ' done.')
        time.sleep(random.randint(2, 6))


def merge(posts):
    _sorted = map(lambda x: x['text'],
                  sorted(posts, key=lambda x: (x['link_id'], int(x['created_utc']))))
    return ' '.join(_sorted)


def test_ping():
    headers = auth_headers(apikey, apisecret)
    print("PING URL:---------> " + ping_url)
import jsonlines

if __name__ == "__main__":
    if len(sys.argv) == 1:
        logging.error("Usage: {} /path/to/json/file".format(sys.argv[0]))
        exit(1)

    if len(sys.argv) == 3:
        jsonl_dir = sys.argv[2]
    else:
        jsonl_dir = "./"

    json_path = Path(sys.argv[1])
    json_files = json_path.glob("*.json") if json_path.is_dir() else [json_path]

    for json_file in json_files:
        if json_file.is_file():
            jsonl_file = Path(jsonl_dir, json_file.name).with_suffix(".jsonl")
            with open(json_file) as f:
                l = [{k: v for k, v in o.items() if v is not None}
                     for o in json.load(f)]
            with jsonlines.open(jsonl_file, mode='w') as writer:
                writer.write_all(l)
        else:
            logging.warning("Failed to open {}, skip".format(json_file))
def do_driver_lookups(articles, mediod_indices, drivers, num=5):
    drivertok = tokenize_texts(drivers)
    driver_jsonl = []
    for i, mdi in enumerate(mediod_indices):
        # pdb.set_trace()
        results = ranker.article2queries(word_tokenize(articles[mdi]), drivertok, num)
        results = [res['driver'] for res in results]
        driver_jsonl.append(get_driver_dict(results, i))
    return driver_jsonl


with open(data_labels_path, 'rb') as rpkl:
    data, labels = pickle.load(rpkl)

articles = []
with jsonlines.open(article_jsonlines_path, 'r') as jr:
    for line in jr:
        articles.append(line)
article_text = [ar['text'] for ar in articles]

with open(driverpath, 'r') as dr:
    # the original comprehension reused the name 'dr', shadowing the file handle
    drivers = [line.strip().replace('\n', '') for line in dr.readlines()]

mediod_indices = get_mediod_indices(data, labels)
driver_jsonl = do_driver_lookups(article_text, mediod_indices, drivers)
with jsonlines.open(outputpath, 'w') as jw:
    jw.write_all(driver_jsonl)