Example #1
def read_proxies(filepath):
    """Read proxy entries from a .jsonl file and return them as a list."""
    proxy_list = []
    with jsonlines.open(filepath) as reader:
        for state in reader:
            proxy_list.append(state)
    return proxy_list
Example #2
def read_states_from_jsonl(filename):
    """Reads and returns data from a .jsonl file.
    Returns a list of data, skipping the header row."""
    states = []
    with jsonlines.open(filename) as reader:
        for i, state in enumerate(reader):
            # Skip the header row and any empty entries. (The original compared
            # the integer index against the string 'None', which never matches.)
            if i == 0 or state is None:
                continue
            states.append(state)
    return states
Example #3
def generate_jsonlines(input, output):
    import json
    import jsonlines

    with open(input, 'r') as pn_file:
        raw = json.load(pn_file)
    hits = raw['hits']['hits']

    with jsonlines.open(output, mode='w') as writer:
        for hit in hits:
            source = hit[WEBPAGE_SOURCE]
            writer.write(source)  # jsonlines writers expose write(), one JSON value per line
Example #4
def load_intermediate_data(path, format='jsonlines'):
    dataset = []
    if format == 'jsonlines':
        import jsonlines
        with jsonlines.open(path, mode='r') as reader:
            for line in reader:
                dataset.append([line['sid'], line['content']])
    elif format == 'csv':
        import csv
        # Open in text mode with newline='' (the 'rb' mode was Python 2 style).
        with open(path, 'r', newline='') as csvfile:
            reader = csv.reader(csvfile)
            for row in reader:
                dataset.append(row)
    return dataset
Example #5
def generate_intermediate_data(dataset, output_path, format='jsonlines'):
    if format == 'jsonlines':
        import jsonlines
        with jsonlines.open(output_path, mode='w') as writer:
            for data in dataset:
                writer.write({'sid': data[0], 'content': data[1]})

    elif format == 'csv':
        import csv
        # Open in text mode with newline='' (the 'wb' mode was Python 2 style).
        with open(output_path, 'w', newline='') as csvfile:
            spamwriter = csv.writer(csvfile)
            for data in dataset:
                spamwriter.writerow(data)
Example #6
def read_states_from_excel():
    """Reads data from an Excel file and writes it to a .jsonl file.
    Returns the file path."""
    wb = load_workbook(filename='states/Top50CitiesinUSwithZipCodes.xlsx')
    ws = wb.active  # get_active_sheet() is deprecated in openpyxl

    with jsonlines.open('states/tmpStates.jsonl', 'w') as f:
        for row in ws.rows:
            tmp_list = [str(cell.value) for cell in row]
            f.write(tmp_list)
    return 'states/tmpStates.jsonl'
Example #7
def startCollectProxies():
    """Navigate to the proxy-list site and collect proxies."""
    driver = webdriver.Firefox()
    driver.get("http://hideme.ru/proxy-list/?maxtime=250&type=45#list")

    with jsonlines.open('proxies/proxy.jsonl', 'w') as f:
        cont = driver.find_element_by_xpath('//tbody')
        for i, row in enumerate(cont.find_elements_by_xpath('//tr')):
            if i == 0:
                continue  # skip the table header row
            f.write(conv_str(row.text))

    time.sleep(1)
    driver.quit()
    return 'proxies/proxy.jsonl'
Example #8
def get_proxy(html):
    soup = BeautifulSoup(html, 'lxml')
    table = soup.find('table', class_='proxy__t')

    with jsonlines.open('proxy/proxy.jsonl', 'w') as f:
        tbody = table.find('tbody')
        for tr in tbody.find_all('tr'):
            pr_list = []
            for k, td in enumerate(tr.find_all('td')):
                if k in (0, 1):
                    pr_list.append(td.text)
                elif k == 4:
                    pr_list.append(conv_str(td.text))
            f.write(pr_list)
    return 'proxy/proxy.jsonl'
Example #9
def main():
    root_path = os.path.join('data', 'datasets', 'sst')
    if not os.path.exists(root_path):
        os.makedirs(root_path)
    os.chdir(root_path)

    base_url = 'https://raw.githubusercontent.com/PrincetonML/SIF/master/data'
    filenames = ['sentiment-train', 'sentiment-dev', 'sentiment-test']
    for filename in filenames:
        if not os.path.exists(filename):
            os.system('wget %s' % (base_url + '/' + filename))

    spacy_nlp = spacy.load('en')

    for filename in filenames:
        print(filename)
        with codecs.open(filename, 'r', 'utf-8') as f, jsonlines.open(filename + '.jsonl', mode='w') as writer:
            lines = [l for l in f]
            progress = tqdm(lines, mininterval=1, leave=False)
            for line in progress:
                text, label = line.strip().split('\t')
                text_spacy = ' '.join([t.text for t in spacy_nlp(text)])
                writer.write({'text': text_spacy, 'label': label})
Example #10
def main():
    description()

    page_count = get_page_count(get_html_proxy(BASE_URL))

    print('Total pages: %s \n\n' % page_count)

    file_name = input("Name your output file, e.g. ParsedData: ---> ")
    t = input("Type the timeout in seconds between every 8 parsed doctors (e.g. 15): ---> ")
    start_page = input("Type the page number where you want to start: ---> ")
    end_page = input("Type the page number where you want to stop: ---> ")
    print("\n\n\n")

    if t == '':
        t = 20
    elif int(t) < 1:
        t = 1
    elif int(t) > 60:
        t = 60
    else:
        t = int(t)

    if start_page == '':
        start_page = 0
    else:
        start_page = int(start_page)

    if end_page == '':
        end_page = page_count
    else:
        end_page = int(end_page)

    print("Start collecting proxies...")
    proxy_list = read_proxies(get_proxy(get_html_proxy(PROXY_SITE)))
    print(proxy_list)
    print("Proxies had been collected!")

    print("Let\'s start parse:")

    pcounter = start_page

    for page in range(start_page, end_page + 1):
        doctors = []

        if page == 0:
            link = BASE_URL
        else:
            link = BASE_URL + '/' + str(page)

        doctors.extend(parse(get_html(link)))

        with jsonlines.open('tmp/' + str(pcounter) + '.jsonl', 'w') as tf:
            for item in doctors:
                tf.write(item)
            tf.close()

        pcounter += 1
        print('Parsed page: %s' % pcounter)
        print('Parsing process: %d%%' % ((page / end_page) * 100))

        time.sleep(t)   # Timeouts

    print("\n\n\nParsing end")
    print("\nStart saving parsed data...\n")

    w_into_file(file_name)

    print("Finish! Now you can to close this program. ;)\n")
    q = input("Type \'q\' and press ENTER to close this program :) ---> ")
    if q == 'q':
        quit()
    string = re.sub(r"\s{2,}", " ", string)
    string = re.sub(r"@", "", string)
    return string.lower()


MAX_SEQUENCE_LENGTH = 1000
MAX_NB_WORDS = 20000
EMBEDDING_DIM = 100
VALIDATION_SPLIT = 0.1111806

count = 0
full_count = 0
train_val_data = []
test_data = []

with jsonlines.open('instances.jsonl') as reader:
    for obj in reader.iter(type=dict, skip_invalid=True):
        count += 1
        full_count += 1
        if count > 17600:
            test_data.append(obj)
        else:
            train_val_data.append(obj)

count = 0
truth_data = []
with jsonlines.open('truth.jsonl') as reader:
    for obj in reader.iter(type=dict, skip_invalid=True):
        truth_data.append(obj)

run(train_val_data, test_data, truth_data)
Example #12
def read_data(data_file):
	data = []
	with jsonlines.open(data_file) as reader:
		for obj in reader:
			data.append(obj)
	return data
Example #13
CHUNK_SIZE = 100000
# Optional limit after which to abort chunking.
LIMIT = None

# Create current writer for chunked segment.
def create_current_writer(i):
    chunk_no = math.floor(i / CHUNK_SIZE)
    output_filename = OUT_DIR + '/ocdata.chunk' + str(chunk_no) + '.jsonl'
    current_writer = jsonlines.open(output_filename, mode='w')
    return current_writer

print("Chunking data from input: " + SOURCE_PATH)

i = 0
current_writer = create_current_writer(i)
reader = jsonlines.open(SOURCE_PATH)
for obj in tqdm(reader):
    if i != 0 and i % CHUNK_SIZE == 0:
        current_writer.close()
        current_writer = create_current_writer(i)
    # Stop on limit, if any.
    if LIMIT is not None and i >= LIMIT:
        break
    # Add line number.
    obj['_line'] = i
    current_writer.write(obj)
    i += 1

reader.close()
current_writer.close()
print("Done chunking " + str(i) + " records.")
Example #14
def print_hi(name):
    print(f'Hi, {name}')
    DATA_PATH_JSON = work_dir + "data/sarcasm_data.json"
    BERT_TARGET_EMBEDDINGS = work_dir + "data/bert-output.jsonl"
    INDICES_FILE = work_dir + "data/split_indices.p"
    AUDIO_PICKLE = work_dir + "data/audio_features.p"
    BATCH_SIZE = 32
    model_path = os.path.join(work_dir + 'saved', f'lfdnn-mustard-M.pth')
    model_name = 'lf_dnn'

    RESULT_FILE = work_dir + "output/independent/ta/{}.json"

    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    print(f'device: {device}')
    print()

    def pickle_loader(filename):
        if sys.version_info[0] < 3:
            return pickle.load(open(filename, 'rb'))
        else:
            return pickle.load(open(filename, 'rb'), encoding="latin1")

    class MMDataset(Dataset):
        def __init__(self, text_feature, video_feature_mean,
                     audio_feature_mean, label_out):
            # print('MMDataset')
            self.vision = video_feature_mean
            self.text = text_feature
            self.audio = audio_feature_mean
            self.label = label_out
            # print('self.label')

        def __len__(self):
            return len(self.label)

        def __getitem__(self, index):
            # print('index')
            # print(index)
            sample = {
                'text': torch.Tensor(self.text[index]),
                'vision': torch.Tensor(self.vision[index]),
                'audio': torch.Tensor(self.audio[index]),
                'labels':
                torch.Tensor(self.label[index]).type(torch.LongTensor)
            }

            return sample

    dataset_json = json.load(open(DATA_PATH_JSON))
    print(type(dataset_json), len(dataset_json))  # dict 690
    print(list(dataset_json.keys())[:2])
    tmp = list(dataset_json.keys())[:2]
    print(tmp[0])
    tmp = dataset_json[tmp[0]]
    print(tmp)

    # text
    text_bert_embeddings = []

    with jsonlines.open(BERT_TARGET_EMBEDDINGS) as reader:
        print('opened bert: ', BERT_TARGET_EMBEDDINGS)
        for obj in reader:
            CLS_TOKEN_INDEX = 0
            features = obj['features'][CLS_TOKEN_INDEX]
            bert_embedding_target = []
            for layer in [0, 1, 2, 3]:
                bert_embedding_target.append(
                    np.array(features["layers"][layer]["values"]))

            bert_embedding_target = np.mean(bert_embedding_target, axis=0)
            # print(bert_embedding_target.shape) 768
            text_bert_embeddings.append(np.copy(bert_embedding_target))
    print('np.array(text_bert_embeddings).shape bert 768 ')
    print(np.array(text_bert_embeddings).shape)  # 690 768

    # video
    video_features_file = h5py.File(
        work_dir + 'data/features/utterances_final/resnet_pool5.hdf5', 'r')
    # combined feature index
    # audio dict (283 12)   (283 11)
    audio_features = pickle_loader(AUDIO_PICKLE)

    TEXT_ID = 0
    VIDEO_ID = 1
    AUDIO_ID = 2
    SHOW_ID = 3
    # parse_data
    data_input, data_output = [], []
    # data_input: [(text, video, audio, show), ...]
    # text: 768-dim, video: frames x 2048
    for idx, ID in enumerate(dataset_json.keys()):
        # print(idx, 'processing ... ', ID) 0 processing ...  1_60
        data_input.append((
            text_bert_embeddings[idx],  # 0 TEXT_ID
            video_features_file[ID][()],  # 1 VIDEO_ID
            audio_features[ID],  # 2 AUDIO_ID
            dataset_json[ID]["show"]  # 2 SHOW_ID
        ))

        data_output.append(int(dataset_json[ID]["sarcasm"]))

    print('close video_features_file')
    video_features_file.close()

    splits = 5
    skf = StratifiedKFold(n_splits=splits, shuffle=True)
    split_indices = [
        (train_index, test_index)
        for train_index, test_index in skf.split(data_input, data_output)
    ]
    print('split_indices: ')
    # print(split_indices)
    print(split_indices[0][0].shape, split_indices[0][1].shape)
    print(len(split_indices))
    # (552,)(138, )
    # 5

    if not os.path.exists(INDICES_FILE):
        pickle.dump(split_indices, open(INDICES_FILE, 'wb'), protocol=2)

    split_indices = pickle_loader(INDICES_FILE)
    print('after pickle_loader: ')
    print(split_indices[0][0].shape, split_indices[0][1].shape)
    print(len(split_indices))

    def get_data_loader(train_ind_SI):
        dataLoader = None
        # (text,video,AUDIO)
        train_input = [data_input[ind] for ind in train_ind_SI]
        # [0 1 0 1 ]
        train_out = np.array([data_output[ind] for ind in train_ind_SI])
        # expand dims: (n,) -> (n, 1); may be unnecessary for cross-entropy
        train_out = np.expand_dims(train_out, axis=1)

        def getData(ID=None):
            return [instance[ID] for instance in train_input]

        # Text Feature
        train_text_feature = getData(TEXT_ID)
        # video Feature
        train_video_feature = getData(VIDEO_ID)
        train_video_feature_mean = np.array([
            np.mean(feature_vector, axis=0)
            for feature_vector in train_video_feature
        ])
        # audio Feature
        audio = getData(AUDIO_ID)
        # (552, 283)
        train_audio_feature = np.array(
            [np.mean(feature_vector, axis=1) for feature_vector in audio])
        train_dataset = MMDataset(train_text_feature, train_video_feature_mean,
                                  train_audio_feature, train_out)
        train_dataLoader = DataLoader(train_dataset,
                                      batch_size=BATCH_SIZE,
                                      num_workers=0,
                                      shuffle=True)
        dataLoader = train_dataLoader

        return dataLoader

    class SubNet(nn.Module):
        def __init__(self, in_size, hidden_size, dropout):
            super(SubNet, self).__init__()
            self.norm = nn.BatchNorm1d(in_size)
            self.drop = nn.Dropout(p=dropout)
            self.linear_1 = nn.Linear(in_size, hidden_size)
            self.linear_2 = nn.Linear(hidden_size, hidden_size)
            self.linear_3 = nn.Linear(hidden_size, hidden_size)

        def forward(self, x):
            normed = self.norm(x)
            dropped = self.drop(normed)
            y = F.relu(self.linear_1(dropped))
            # y = self.linear_1(dropped)
            y = F.relu(self.linear_2(y))
            y = F.relu(self.linear_3(y))
            out = y
            return out

    # The audio features are only 283-dimensional, so keep this subnet small to avoid overfitting

    class SubAudioNet(nn.Module):
        def __init__(self, in_size, hidden_size, dropout):
            super(SubAudioNet, self).__init__()
            self.norm = nn.BatchNorm1d(in_size)
            self.drop = nn.Dropout(p=dropout)
            self.linear_1 = nn.Linear(in_size, hidden_size)
            self.linear_2 = nn.Linear(hidden_size, hidden_size)
            self.linear_3 = nn.Linear(hidden_size, hidden_size)

        def forward(self, x):
            normed = self.norm(x)
            dropped = self.drop(normed)
            y = F.relu(self.linear_1(dropped))
            # y = self.linear_1(dropped)
            y = F.relu(self.linear_2(y))
            y = F.relu(self.linear_3(y))
            out = y
            return out

    class LF_DNN(nn.Module):
        def __init__(self):
            super(LF_DNN, self).__init__()
            self.text_in, self.video_in, self.audio_in = 768, 2048, 283
            self.text_hidden, self.video_hidden, self.audio_hidden = 128, 128, 32
            # self.text_out = 32
            self.post_fusion_dim1 = 256
            self.post_fusion_dim = 32
            self.video_prob, self.text_prob, self.audio_prob, self.post_fusion_prob = (
                0.2, 0.2, 0.2, 0.2)
            self.video_subnet = SubNet(self.video_in, self.video_hidden,
                                       self.video_prob)
            self.audio_subnet = SubAudioNet(self.audio_in, self.audio_hidden,
                                            self.audio_prob)
            self.text_subnet = SubNet(self.text_in, self.text_hidden,
                                      self.text_prob)

            self.post_fusion_dropout = nn.Dropout(p=self.post_fusion_prob)
            self.post_fusion_dropout1 = nn.Dropout(p=self.post_fusion_prob)
            self.post_fusion_layer_1 = nn.Linear(
                self.text_hidden + self.audio_hidden + self.text_in +
                self.audio_in, self.post_fusion_dim1)
            self.post_fusion_layer_4 = nn.Linear(self.post_fusion_dim1,
                                                 self.post_fusion_dim1)
            self.post_fusion_layer_5 = nn.Linear(self.post_fusion_dim1,
                                                 self.post_fusion_dim1)
            # self.post_fusion_layer_2 = nn.Linear(self.post_fusion_dim1, self.post_fusion_dim)

            self.post_fusion_layer_6 = nn.Linear(
                self.post_fusion_dim1 + self.text_in + self.audio_in,
                self.post_fusion_dim1)

            self.post_fusion_layer_7 = nn.Linear(self.post_fusion_dim1,
                                                 self.post_fusion_dim1)
            self.post_fusion_layer_8 = nn.Linear(self.post_fusion_dim1,
                                                 self.post_fusion_dim)

            self.post_fusion_layer_3 = nn.Linear(self.post_fusion_dim, 2)

        def forward(self, text_x, video_x, audio_x):
            video_h = self.video_subnet(video_x)
            audio_h = self.audio_subnet(audio_x)
            text_h = self.text_subnet(text_x)
            # concatenated size: 128 + 32 + 768 + 283 = 1211
            fusion_h = torch.cat([text_h, audio_h, text_x, audio_x], dim=-1)

            x = self.post_fusion_dropout(fusion_h)
            # x = self.post_fusion_layer_1(x)
            x = F.relu(self.post_fusion_layer_1(x), inplace=True)
            x = F.relu(self.post_fusion_layer_4(x), inplace=True)
            x = F.relu(self.post_fusion_layer_5(x), inplace=True)
            # x = F.relu(self.post_fusion_layer_2(x), inplace=True)

            x = torch.cat([x, text_x, audio_x], dim=-1)

            x = self.post_fusion_dropout1(x)
            x = F.relu(self.post_fusion_layer_6(x), inplace=True)
            x = F.relu(self.post_fusion_layer_7(x), inplace=True)
            x = F.relu(self.post_fusion_layer_8(x), inplace=True)

            output = self.post_fusion_layer_3(x)
            return output

    # model = SubNet(2048,128,0.2)
    # model1 = SubNet(768,32,0.2)
    model2 = LF_DNN()
    model2.to(device)
    # summary(model,(2048,))
    # summary(model1,(768,))
    summary(model2, [(768, ), (2048, ), (283, )])
    # summary(model, [(1, 16, 16), (1, 28, 28)])

    learning_rate = 5e-4
    weight_decay = 0.0
    early_stop = 20

    def do_test(model2, dataLoader, mode="VAL"):
        criterion = nn.CrossEntropyLoss()
        model2.eval()
        y_pred, y_true = [], []
        eval_loss = 0.0
        eval_acc = 0.0
        with torch.no_grad():
            with tqdm(dataLoader) as td:
                for batch_data in td:
                    vision = batch_data['vision'].to(device)
                    text = batch_data['text'].to(device)
                    audio = batch_data['audio'].to(device)
                    labels = batch_data['labels'].to(device)

                    outputs = model2(text, vision, audio)

                    loss = criterion(outputs, labels.squeeze())

                    eval_loss += loss.item()
                    eval_acc += (outputs.argmax(1) == torch.squeeze(
                        labels.long())).sum().item()

                    y_pred.append(outputs.argmax(1).cpu())
                    y_true.append(labels.squeeze().long().cpu())

        pred, true = torch.cat(y_pred), torch.cat(y_true)

        eval_loss = eval_loss / len(pred)
        eval_acc = eval_acc / len(pred)
        # print('len dataLoader:',len(dataLoader))  1
        print("%s-(%s) >> loss: %.4f acc: %.4f" %
              (mode, 'lf_dnn', eval_loss, eval_acc))

        return eval_acc, pred, true

    def do_train(model2, train_dataLoader, val_dataLoader):
        best_acc = 0
        epochs, best_epoch = 0, 0
        criterion = nn.CrossEntropyLoss()
        optimizer = optim.Adam(model2.parameters(),
                               lr=learning_rate,
                               weight_decay=weight_decay)
        while True:
            epochs += 1
            y_pred, y_true = [], []
            model2.train()
            train_loss = 0.0
            train_acc = 0.0
            with tqdm(train_dataLoader) as td:
                for batch_data in td:
                    vision = batch_data['vision'].to(device)
                    audio = batch_data['audio'].to(device)
                    text = batch_data['text'].to(device)

                    labels = batch_data['labels'].to(device)

                    optimizer.zero_grad()
                    # forward
                    outputs = model2(text, vision, audio)

                    loss = criterion(outputs, labels.squeeze())
                    # backward
                    loss.backward()
                    # update
                    optimizer.step()
                    train_loss += loss.item()

                    train_acc += (outputs.argmax(1) == torch.squeeze(
                        labels.long())).sum().item()

                    y_pred.append(outputs.argmax(1).cpu())
                    y_true.append(labels.squeeze().long().cpu())

            pred, true = torch.cat(y_pred), torch.cat(y_true)

            train_loss = train_loss / len(pred)

            train_acc = train_acc / len(pred)

            print(
                "TRAIN-(%s) (%d/%d)>> loss: %.4f train_acc: %.4f" %
                ('lf_dnn', epochs - best_epoch, epochs, train_loss, train_acc))

            val_acc, y_pred, y_true = do_test(model2,
                                              val_dataLoader,
                                              mode="VAL")
            # select the best model by weighted-average F1 rather than raw accuracy
            result_dict = classification_report(y_true,
                                                y_pred,
                                                digits=3,
                                                output_dict=True)

            val_acc = result_dict["weighted avg"]["f1-score"]
            print(f'weighted avg f1-score {val_acc}')

            if val_acc > best_acc:
                best_acc, best_epoch = val_acc, epochs
                print(model_path)
                if os.path.exists(model_path):
                    os.remove(model_path)
                torch.save(model2.cpu().state_dict(), model_path)
                model2.to(device)

            # early stop
            if epochs - best_epoch >= early_stop:
                print(f'the best epochs:{best_epoch},the best acc:{best_acc}')
                return
                # break

    results = []

    def five_fold():
        def getSpeakerIndependent():
            train_ind_SI, test_ind_SI = [], []
            for ind, data in enumerate(data_input):
                if data[SHOW_ID] == "FRIENDS":
                    test_ind_SI.append(ind)
                else:
                    train_ind_SI.append(ind)
            train_index, test_index = train_ind_SI, test_ind_SI
            return np.array(train_index), np.array(test_index)

        def getSpeakerIndependent_ours():
            train_ind_SI, test_ind_SI = [], []
            test_speakers = ['HOWARD', 'SHELDON']
            for idx, ID in enumerate(dataset_json.keys()):
                speaker = dataset_json[ID]["speaker"]
                if speaker in test_speakers:
                    test_ind_SI.append(idx)
                else:
                    train_ind_SI.append(idx)

            train_index, test_index = train_ind_SI, test_ind_SI
            return np.array(train_index), np.array(test_index)

        for fold in range(5):
            print(fold, '-' * 50)
            # (train_index, test_index) = getSpeakerIndependent()
            (train_index, test_index) = getSpeakerIndependent_ours()

            train_ind_SI = train_index
            val_ind_SI = test_index
            test_ind_SI = test_index

            print(train_ind_SI.shape, val_ind_SI.shape, test_ind_SI.shape)
            train_dataLoader = get_data_loader(train_ind_SI)
            val_dataLoader = get_data_loader(val_ind_SI)
            test_dataLoader = get_data_loader(test_ind_SI)
            model2 = LF_DNN()
            model2.to(device)

            do_train(model2, train_dataLoader, val_dataLoader)
            print()
            print(f'load:{model_path}')
            model2.load_state_dict(torch.load(model_path))
            model2.to(device)
            # do test
            val_acc, y_pred, y_true = do_test(model2,
                                              test_dataLoader,
                                              mode="TEST")
            print('Test: ', val_acc)
            # print(pred,true)
            result_string = classification_report(y_true, y_pred, digits=3)
            print('confusion_matrix(y_true, y_pred)')
            print(confusion_matrix(y_true, y_pred))
            print(result_string)

            result_dict = classification_report(y_true,
                                                y_pred,
                                                digits=3,
                                                output_dict=True)
            results.append(result_dict)

        # Dumping result to output
        if not os.path.exists(os.path.dirname(RESULT_FILE)):
            os.makedirs(os.path.dirname(RESULT_FILE))
        with open(RESULT_FILE.format(model_name), 'w') as file:
            json.dump(results, file)
        print('dump results  into ', RESULT_FILE.format(model_name))

        return None

    five_fold()

    def printResult(model_name=None):
        results = json.load(open(RESULT_FILE.format(model_name), "rb"))
        weighted_precision, weighted_recall = [], []
        weighted_fscores = []
        print("#" * 20)
        for fold, result in enumerate(results):
            weighted_fscores.append(result["weighted avg"]["f1-score"])
            weighted_precision.append(result["weighted avg"]["precision"])
            weighted_recall.append(result["weighted avg"]["recall"])
            print("Fold {}:".format(fold + 1))
            print(
                "Weighted Precision: {}  Weighted Recall: {}  Weighted F score: {}"
                .format(result["weighted avg"]["precision"],
                        result["weighted avg"]["recall"],
                        result["weighted avg"]["f1-score"]))
        print("#" * 20)
        print("Avg :")
        print(
            "Weighted Precision: {:.3f}  Weighted Recall: {:.3f}  Weighted F score: {:.3f}"
            .format(np.mean(weighted_precision), np.mean(weighted_recall),
                    np.mean(weighted_fscores)))

        tmp = {
            'precision': np.mean(weighted_precision),
            'recall': np.mean(weighted_recall),
            'f1': np.mean(weighted_fscores)
        }

        file_name = 'five_results_average'
        with open(RESULT_FILE.format(file_name), 'w') as file:
            json.dump(tmp, file)

    printResult(model_name=model_name)
Example #15
 def _load(self) -> List:
     with jsonlines.open(self._filepath, mode='r') as f:
         image_info = list(f)
     return image_info
Example #16
    def _read_dataset(self):
        """Open news dataset and divides contents by category."""
        print(
            "Opening dataset News Category Dataset v2 (200k entries) and splitting data..."
        )
        with jsonlines.open(
                os.path.join(ROOT, 'classifier', 'datasets',
                             'News_Category_Dataset_v2.json')) as news:
            for item in news.iter(type=dict, skip_invalid=True):
                cat = item['category'].lower()

                if 'style' in cat or 'home' in cat:
                    self.categories['lifestyle'].append([
                        self.format_sentence(item['headline'].lower()),
                        'Lifestyle'
                    ])
                    self.categories['lifestyle'].append([
                        self.format_sentence(
                            item['short_description'].lower()), 'Lifestyle'
                    ])

                if 'food' in cat or 'taste' in cat:
                    self.categories['food'].append([
                        self.format_sentence(item['headline'].lower()), 'Food'
                    ])
                    self.categories['food'].append([
                        self.format_sentence(
                            item['short_description'].lower()), 'Food'
                    ])

                if 'art' in cat:
                    self.categories['arts'].append([
                        self.format_sentence(item['headline'].lower()), 'Arts'
                    ])
                    self.categories['arts'].append([
                        self.format_sentence(
                            item['short_description'].lower()), 'Arts'
                    ])

                if 'healthy' in cat:
                    self.categories['health'].append([
                        self.format_sentence(item['headline'].lower()),
                        'Health'
                    ])
                    self.categories['health'].append([
                        self.format_sentence(
                            item['short_description'].lower()), 'Health'
                    ])

                if cat in self.categories.keys():
                    self.categories[cat].append([
                        self.format_sentence(item['headline'].lower()),
                        cat.title()
                    ])
                    self.categories[cat].append([
                        self.format_sentence(
                            item['short_description'].lower()),
                        cat.title()
                    ])

        print(
            "Done splitting data. Opening UCI News Aggregator (400k entries) and splitting data..."
        )

        with open(
                os.path.join(ROOT, 'classifier', 'datasets',
                             'uci-news-aggregator.csv'), ) as input_csv:
            news_reader = csv.reader(input_csv, delimiter=",")
            for row in news_reader:
                if row[4] == 'b':
                    self.categories['business'].append(
                        [self.format_sentence(row[1].lower()), 'Business'])

                if row[4] == 't':
                    self.categories['tech'].append(
                        [self.format_sentence(row[1].lower()), 'Tech'])

                if row[4] == 'e':
                    self.categories['entertainment'].append([
                        self.format_sentence(row[1].lower()), 'Entertainment'
                    ])
                if row[4] == 'm':
                    self.categories['health'].append(
                        [self.format_sentence(row[1].lower()), 'Health'])
        print("Done splitting ")
Example #17
 def open_file(self):
     return jsonlines.open(self.converted_filename, mode="w")
Example #18
# import warnings
# warnings.filterwarnings("error")

import sys
print(sys.getrecursionlimit())
sys.setrecursionlimit(5000)

for file in os.listdir(os.path.join(prefix, 'clean_final')):
    path = os.path.join(prefix, 'clean_final', file)
    data = []

    if 'train' in file:
        continue

    with jsonlines.open(path) as reader:
        for obj in reader:
            data.append(obj)

    all_scores = defaultdict(dict)

    i = 0

    final_documents = {}
    for bill in data:
        i += 1
        if i % 50 == 0:
            print(i)

        summary = bill['clean_summary']
        doc = bill['clean_text']
Example #19
        'f1_support': round(f1_support, 4),
        'precision_support': round(precision_support, 4),
        'recall_support': round(recall_support, 4),
        'f1_contradict': round(f1_contradict, 4),
        'precision_contradict': round(precision_contradict, 4),
        'recall_contradict': round(recall_contradict, 4)
    }


parser = argparse.ArgumentParser()
parser.add_argument('--dataset', type=str, required=True)
parser.add_argument('--rationale-selection', type=str, required=True)
parser.add_argument('--label-prediction', type=str, required=True)
args = parser.parse_args()

dataset = {data['id']: data for data in jsonlines.open(args.dataset)}
rationale_selection = list(jsonlines.open(args.rationale_selection))
label_prediction = jsonlines.open(args.label_prediction)


def get_gold_label(claim_id: int, doc_id: int):
    labels = {
        es['label']
        for es in dataset[claim_id]['evidence'].get(str(doc_id)) or []
    }
    if labels:
        return next(iter(labels))
    else:
        return 'NOT_ENOUGH_INFO'

Example #20
def write_jsonlines(array, file):
    with jsonlines.open(file, mode='w') as writer:
        writer.write_all(array)
Example #21
def read_jsonlines(file, handler=lambda obj: obj):
    dat = []
    with jsonlines.open(file) as reader:
        for obj in reader:
            dat.append(handler(obj))
    return dat
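A small round-trip sketch combining write_jsonlines and read_jsonlines above; the file name and records are illustrative:

records = [{'id': 1, 'text': 'hello'}, {'id': 2, 'text': 'world'}]
write_jsonlines(records, 'records.jsonl')

# The optional handler transforms each object while reading.
texts = read_jsonlines('records.jsonl', handler=lambda obj: obj['text'])
assert texts == ['hello', 'world']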
Example #22
 def test_jsonl(self):
     obj = jsonlines.open(imd_data_, mode='r')
     for i in obj:
         print(i)
Example #23
        print("inside sent ret")
        dataset_paths = [
            "fever_full_binary_dev_sent_ret_split1",
            "fever_full_binary_dev_sent_ret_split2",
            "fever_full_binary_dev_sent_ret_split3"
        ]
        model_path = "models_fever_full/sentence_retrieval_models/model_bert_fever_full_binaryAcc73.h5"
        model = load_model(model_path)
        print("model loaded")
        embeddings_paths = [
            "fever_full_dev_binary_sent_ret_bert_60k",
            "fever_full_dev_binary_sent_ret_bert_60k_120k",
            "fever_full_dev_binary_sent_ret_bert_120k_plus"
        ]

        results = jsonlines.open(sr_results_path, mode="w")

        for i in range(len(dataset_paths)):

            dataset_path = "/scratch/kkuma12s/github/fact-validation/thesis-code/Proof_Extraction/data/fever-full/complete_pipeline/sent_ret/bert/" + dataset_paths[
                i] + ".jsonl"
            test_model = testModel(dataset_path, model_path)
            claims_sent_vec_combined = test_model.load_compressed_pickle_file(
                "/scratch/kkuma12s/new_embeddings/" + embeddings_paths[i])
            print("test data size ", test_model.test_data.shape)
            print("inside sentence retrieval and loading bert embeddings ")
            print("claims vec 1 ", claims_sent_vec_combined.shape)
            batch_size = 32
            total_possible_batch_sizes = len(
                claims_sent_vec_combined) / batch_size
            print("total possible batch sizes ", total_possible_batch_sizes)
Example #24
def load_rows(search_collection):
    return list(jsonlines.open(search_collection))
Example #25
    WHERE EXISTS (
    SELECT new_path FROM UNNEST(difference)
    WHERE (new_path LIKE "%.py")
    )
    AND regexp_contains(subject, 'bug|fix|issue|error')
"""


query_job = client.query(lang_query)  # Make an API request.

print("Querying languages:")
counter = 0

headers = {'Accept': 'application/vnd.github.v3+json', 'Authorization': 'token ' + git_token}

with jsonlines.open('data/data.jsonl', mode='w') as writer:
    for row in query_job:
        commit_sha = row.commit
        
        repo = row.repo_name[0] if (isinstance(row.repo_name, list)) else row.repo_name

        for diff in row.difference:
            
            old_file_path = diff['old_path']
            new_file_path = diff['new_path']

            url_before = "https://api.github.com/repos/%s/contents/%s?ref=%s" % (repo, old_file_path, commit_sha)
            url_after = "https://api.github.com/repos/%s/contents/%s?ref=%s" % (repo, new_file_path, commit_sha + '^')
            
            try:
Example #26
download = False
sc_folder = ".\\media\\sc\\"
mov_folder = ".\\media\\mov\\"

with open('top_games.json') as json_file:
    games = json.load(json_file)

# Example game from top_games.json:
    # "570": {appid=570, name="Dota 2", developer="Valve", publisher="Valve", score_rank="", positive=1206694,
    #             negative=221642, userscore=0, owners="100,000,000 .. 200,000,000", average_forever=36567,
    #             average_2weeks=1517, median_forever=1236, median_2weeks=741, price="0", initialprice="0",
    #             discount="0", ccu=618777}


start_index = 5041
with jsonlines.open("detailed_games.jsonl", "a") as detailed_games:
    # Iterate through all top games
    for index in range(start_index, len(games.keys())):

        game = games[list(games)[index]]
        name = game["name"]
        clean_name = re.sub(r'[/\\*:?\"<>|]', '', name)  # Clean name for use in file storage
        app_id = str(game["appid"])
        print(str(index) + " Game: " + name)

        success = False
        timeout_count = 0
        while not success:
            try:
                steam_details = requests.get(steam_store_api, params={"appids": app_id, "language": "english"})
                steam_details = steam_details.json()
Example #27
    def test_write_out_subsentences(self):

        merged_entity_emb_file = tempfile.NamedTemporaryFile()
        out_file = tempfile.NamedTemporaryFile()
        data_file = tempfile.NamedTemporaryFile()

        entity_dir = "test/entity_db"
        entity_map_dir = "entity_mappings"
        alias_cand_map = "alias2qids.json"
        data_config = DottedDict(entity_dir=entity_dir,
                                 entity_map_dir=entity_map_dir,
                                 alias_cand_map=alias_cand_map)
        entitiy_symbols = EntitySymbolsSubclass()
        entitiy_symbols.dump(save_dir=os.path.join(entity_dir, entity_map_dir))

        num_examples = 3
        total_num_mentions = 7
        M = 3
        K = 2
        hidden_size = 2

        # create data file -- just needs aliases and sentence indices
        data = [{
            'aliases': ['a', 'b'],
            'sent_idx_unq': 0
        }, {
            'aliases': ['c', 'd', 'e', 'f', 'g'],
            'sent_idx_unq': 1
        }]

        with jsonlines.open(data_file.name, 'w') as f:
            for row in data:
                f.write(row)

        merged_storage_type = np.dtype([('hidden_size', int),
                                        ('sent_idx', int),
                                        ('alias_list_pos', int),
                                        ('entity_emb', float, hidden_size),
                                        ('final_loss_pred', int),
                                        ('final_loss_prob', float),
                                        ('final_loss_cand_probs', float, K)])

        merged_entity_emb = np.memmap(merged_entity_emb_file.name,
                                      dtype=merged_storage_type,
                                      mode="w+",
                                      shape=(total_num_mentions, ))
        # 2 sentences, 1st sent has 1 subsentence, 2nd sentence has 2 subsentences - 7 mentions total
        merged_entity_emb['hidden_size'] = hidden_size
        # first men
        merged_entity_emb[0]['sent_idx'] = 0
        merged_entity_emb[0]['alias_list_pos'] = 0
        merged_entity_emb[0]['entity_emb'] = np.array([0, 1])
        merged_entity_emb[0]['final_loss_pred'] = 1
        merged_entity_emb[0]['final_loss_prob'] = 0.9
        merged_entity_emb[0]['final_loss_cand_probs'] = np.array([0.1, 0.9])
        # second men
        merged_entity_emb[1]['sent_idx'] = 0
        merged_entity_emb[1]['alias_list_pos'] = 1
        merged_entity_emb[1]['entity_emb'] = np.array([2, 3])
        merged_entity_emb[1]['final_loss_pred'] = 1
        merged_entity_emb[1]['final_loss_prob'] = 0.9
        merged_entity_emb[1]['final_loss_cand_probs'] = np.array([0.1, 0.9])
        # third men
        merged_entity_emb[2]['sent_idx'] = 1
        merged_entity_emb[2]['alias_list_pos'] = 0
        merged_entity_emb[2]['entity_emb'] = np.array([4, 5])
        merged_entity_emb[2]['final_loss_pred'] = 0
        merged_entity_emb[2]['final_loss_prob'] = 0.9
        merged_entity_emb[2]['final_loss_cand_probs'] = np.array([0.9, 0.1])
        # fourth men
        merged_entity_emb[3]['sent_idx'] = 1
        merged_entity_emb[3]['alias_list_pos'] = 1
        merged_entity_emb[3]['entity_emb'] = np.array([6, 7])
        merged_entity_emb[3]['final_loss_pred'] = 0
        merged_entity_emb[3]['final_loss_prob'] = 0.9
        merged_entity_emb[3]['final_loss_cand_probs'] = np.array([0.9, 0.1])
        # fifth men
        merged_entity_emb[4]['sent_idx'] = 1
        merged_entity_emb[4]['alias_list_pos'] = 2
        merged_entity_emb[4]['entity_emb'] = np.array([10, 11])
        merged_entity_emb[4]['final_loss_pred'] = 1
        merged_entity_emb[4]['final_loss_prob'] = 0.9
        merged_entity_emb[4]['final_loss_cand_probs'] = np.array([0.1, 0.9])
        # sixth men
        merged_entity_emb[5]['sent_idx'] = 1
        merged_entity_emb[5]['alias_list_pos'] = 3
        merged_entity_emb[5]['entity_emb'] = np.array([12, 13])
        merged_entity_emb[5]['final_loss_pred'] = 1
        merged_entity_emb[5]['final_loss_prob'] = 0.9
        merged_entity_emb[5]['final_loss_cand_probs'] = np.array([0.1, 0.9])
        # seventh men
        merged_entity_emb[6]['sent_idx'] = 1
        merged_entity_emb[6]['alias_list_pos'] = 4
        merged_entity_emb[6]['entity_emb'] = np.array([14, 15])
        merged_entity_emb[6]['final_loss_pred'] = 1
        merged_entity_emb[6]['final_loss_prob'] = 0.9
        merged_entity_emb[6]['final_loss_cand_probs'] = np.array([0.1, 0.9])

        num_processes = 2
        train_in_candidates = True
        dump_embs = True

        write_data_labels(num_processes, merged_entity_emb_file.name,
                          merged_storage_type, data_file.name, out_file.name,
                          train_in_candidates, dump_embs, data_config)
        '''
          "a":[["Q1",10.0],["Q4",6]],
          "b":[["Q2",5.0],["Q1",3]],
          "c":[["Q1",30.0],["Q2",3]],
          "d":[["Q4",20],["Q3",15.0]],
          "e":[["Q1",10.0],["Q4",6]],
          "f":[["Q2",5.0],["Q1",3]],
          "g":[["Q1",30.0],["Q2",3]]
        '''
        all_lines = []
        with open(out_file.name) as check_f:
            for line in check_f:
                all_lines.append(ujson.loads(line))

        gold_lines = [{
            'sent_idx_unq': 0,
            'aliases': ['a', 'b'],
            'qids': ["Q4", "Q1"],
            'probs': [0.9, 0.9],
            'cands': [["Q1", "Q4"], ["Q2", "Q1"]],
            'cand_probs': [[0.1, 0.9], [0.1, 0.9]],
            'entity_ids': [4, 1],
            'ctx_emb_ids': [0, 1]
        }, {
            'sent_idx_unq':
            1,
            'aliases': ['c', 'd', 'e', 'f', 'g'],
            'qids': ["Q1", "Q4", "Q4", "Q1", "Q2"],
            'probs': [0.9, 0.9, 0.9, 0.9, 0.9],
            'cands': [["Q1", "Q2"], ["Q4", "Q3"], ["Q1", "Q4"], ["Q2", "Q1"],
                      ["Q1", "Q2"]],
            'cand_probs': [[0.9, 0.1], [0.9, 0.1], [0.1, 0.9], [0.1, 0.9],
                           [0.1, 0.9]],
            'entity_ids': [1, 4, 4, 1, 2],
            'ctx_emb_ids': [2, 3, 4, 5, 6]
        }]

        assert len(all_lines) == len(gold_lines)
        for i in range(len(gold_lines)):
            self.assertDictEqual(
                gold_lines[i], all_lines[i],
                f"{ujson.dumps(gold_lines[i], indent=4)} VS {ujson.dumps(all_lines[i], indent=4)}"
            )

        # clean up
        if os.path.exists(entity_dir):
            shutil.rmtree(entity_dir)
        merged_entity_emb_file.close()
        out_file.close()
        data_file.close()
Example #28
def prepare_SLURP(data_folder,
                  save_folder,
                  slu_type,
                  train_splits,
                  skip_prep=False):
    """
    This function prepares the SLURP dataset.
    If the folder does not exist, the zip file will be extracted. If the zip file does not exist, it will be downloaded.

    data_folder : path to SLURP dataset.
    save_folder: path where to save the csv manifest files.
    slu_type : one of the following:

      "direct":{input=audio, output=semantics}
      "multistage":{input=audio, output=semantics} (using ASR transcripts in the middle)
      "decoupled":{input=transcript, output=semantics} (using ground-truth transcripts)

    train_splits : list of splits to be joined to form train .csv
    skip_prep: If True, data preparation is skipped.
    """
    if skip_prep:
        return
    # If the data folders do not exist, we need to download/extract the data
    if not os.path.isdir(os.path.join(data_folder, "slurp_synth")):
        # Check for zip file and download if it doesn't exist
        zip_location = os.path.join(data_folder, "slurp_synth.tar.gz")
        if not os.path.exists(zip_location):
            url = "https://zenodo.org/record/4274930/files/slurp_synth.tar.gz?download=1"
            download_file(url, zip_location, unpack=True)
        else:
            print("Extracting slurp_synth...")
            shutil.unpack_archive(zip_location, data_folder)

    if not os.path.isdir(os.path.join(data_folder, "slurp_real")):
        # Check for zip file and download if it doesn't exist
        zip_location = os.path.join(data_folder, "slurp_real.tar.gz")
        if not os.path.exists(zip_location):
            url = "https://zenodo.org/record/4274930/files/slurp_real.tar.gz?download=1"
            download_file(url, zip_location, unpack=True)
        else:
            print("Extracting slurp_real...")
            shutil.unpack_archive(zip_location, data_folder)

    splits = [
        "train_real",
        "train_synthetic",
        "devel",
        "test",
    ]
    id = 0
    for split in splits:
        new_filename = (os.path.join(save_folder, split) +
                        "-type=%s.csv" % slu_type)
        if os.path.exists(new_filename):
            continue
        print("Preparing %s..." % new_filename)

        IDs = []
        duration = []

        wav = []
        wav_format = []
        wav_opts = []

        semantics = []
        semantics_format = []
        semantics_opts = []

        transcript = []
        transcript_format = []
        transcript_opts = []

        jsonl_path = os.path.join(data_folder, split + ".jsonl")
        if not os.path.isfile(jsonl_path):
            if split == "train_real":
                url_split = "train"
            else:
                url_split = split
            url = (
                "https://github.com/pswietojanski/slurp/raw/master/dataset/slurp/"
                + url_split + ".jsonl")
            download_file(url, jsonl_path, unpack=False)

        with jsonlines.open(jsonl_path) as reader:
            for obj in reader:
                scenario = obj["scenario"]
                action = obj["action"]
                sentence_annotation = obj["sentence_annotation"]
                num_entities = sentence_annotation.count("[")
                entities = []
                for slot in range(num_entities):
                    type = (sentence_annotation.split("[")[slot + 1].split("]")
                            [0].split(":")[0].strip())
                    filler = (sentence_annotation.split("[")[slot + 1].split(
                        "]")[0].split(":")[1].strip())
                    entities.append({"type": type, "filler": filler})
                for recording in obj["recordings"]:
                    IDs.append(id)
                    if "synthetic" in split:
                        audio_folder = "slurp_synth/"
                    else:
                        audio_folder = "slurp_real/"
                    path = os.path.join(data_folder, audio_folder,
                                        recording["file"])
                    signal = read_audio(path)
                    duration.append(signal.shape[0] / 16000)

                    wav.append(path)
                    wav_format.append("flac")
                    wav_opts.append(None)

                    transcript_ = obj["sentence"]
                    if slu_type == "decoupled":
                        transcript_ = transcript_.upper()
                    transcript.append(transcript_)
                    transcript_format.append("string")
                    transcript_opts.append(None)

                    semantics_dict = {
                        "scenario": scenario,
                        "action": action,
                        "entities": entities,
                    }
                    semantics_ = str(semantics_dict).replace(
                        ",", "|"
                    )  # Commas in dict will make using csv files tricky; replace with pipe.
                    semantics.append(semantics_)
                    semantics_format.append("string")
                    semantics_opts.append(None)
                    id += 1

        df = pd.DataFrame({
            "ID": IDs,
            "duration": duration,
            "wav": wav,
            "semantics": semantics,
            "transcript": transcript,
        })
        df.to_csv(new_filename, index=False)

    # Merge train splits
    train_splits = [
        split + "-type=%s.csv" % slu_type for split in train_splits
    ]
    merge_csvs(save_folder, train_splits, "train-type=%s.csv" % slu_type)
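A minimal sketch of calling prepare_SLURP as described in its docstring; the paths below are illustrative:

prepare_SLURP(
    data_folder="data/SLURP",             # where slurp_real/ and slurp_synth/ live (or get downloaded)
    save_folder="data/SLURP/manifests",   # where the per-split CSV manifests are written
    slu_type="direct",                    # "direct", "multistage", or "decoupled"
    train_splits=["train_real", "train_synthetic"],
    skip_prep=False,
)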
Example #29
    def test_merge_subsentences(self):

        test_full_emb_file = tempfile.NamedTemporaryFile()
        test_merged_emb_file = tempfile.NamedTemporaryFile()
        gold_merged_emb_file = tempfile.NamedTemporaryFile()

        num_examples = 3
        total_num_mentions = 7
        M = 3
        K = 2
        hidden_size = 2

        # create full embedding file
        storage_type_full = np.dtype([('M', int), ('K', int),
                                      ('hidden_size', int), ('sent_idx', int),
                                      ('subsent_idx', int),
                                      ('alias_list_pos', int, M),
                                      ('entity_emb', float, M * hidden_size),
                                      ('final_loss_true', int, M),
                                      ('final_loss_pred', int, M),
                                      ('final_loss_prob', float, M),
                                      ('final_loss_cand_probs', float, M * K)])
        full_emb = np.memmap(test_full_emb_file.name,
                             dtype=storage_type_full,
                             mode='w+',
                             shape=(num_examples, ))

        # 2 sentences, 1st sent has 1 subsentence, 2nd sentence has 2 subsentences
        # first sentence
        full_emb['hidden_size'] = hidden_size
        full_emb['M'] = M
        full_emb['K'] = K
        full_emb[0]['sent_idx'] = 0
        full_emb[0]['subsent_idx'] = 0
        # last alias is padded
        full_emb[0]['alias_list_pos'] = np.array([0, 1, -1])
        full_emb[0]['final_loss_true'] = np.array([0, 1, -1])
        # entity embs are flattened
        full_emb[0]['entity_emb'] = np.array([0, 1, 2, 3, 0, 0])

        full_emb[1]['sent_idx'] = 1
        full_emb[1]['subsent_idx'] = 0
        full_emb[1]['alias_list_pos'] = np.array([0, 1, 2])
        # last alias goes with next subsentence
        full_emb[1]['final_loss_true'] = np.array([1, 1, -1])
        full_emb[1]['entity_emb'] = np.array([4, 5, 6, 7, 8, 9])

        full_emb[2]['sent_idx'] = 1
        full_emb[2]['subsent_idx'] = 1
        full_emb[2]['alias_list_pos'] = np.array([2, 3, 4])
        full_emb[2]['final_loss_true'] = np.array([1, 1, 1])
        full_emb[2]['entity_emb'] = np.array([10, 11, 12, 13, 14, 15])

        # create merged embedding file
        storage_type_merged = np.dtype([('hidden_size', int),
                                        ('sent_idx', int),
                                        ('alias_list_pos', int),
                                        ('entity_emb', float, hidden_size),
                                        ('final_loss_pred', int),
                                        ('final_loss_prob', float),
                                        ('final_loss_cand_probs', float, K)])
        merged_emb_gold = np.memmap(gold_merged_emb_file.name,
                                    dtype=storage_type_merged,
                                    mode="w+",
                                    shape=(total_num_mentions, ))
        merged_emb_gold['entity_emb'] = np.array([[0, 1], [2, 3], [4, 5],
                                                  [6, 7], [10, 11], [12, 13],
                                                  [14, 15]])

        # create data file -- just needs aliases and sentence indices
        data = [{
            'aliases': ['a', 'b'],
            'sent_idx_unq': 0
        }, {
            'aliases': ['c', 'd', 'e', 'f', 'g'],
            'sent_idx_unq': 1
        }]

        temp_file = tempfile.NamedTemporaryFile(delete=False).name
        with jsonlines.open(temp_file, 'w') as f:
            for row in data:
                f.write(row)

        # assert that output of merge_subsentences is correct
        num_processes = 2
        eval_utils.merge_subsentences(num_processes,
                                      temp_file,
                                      test_merged_emb_file.name,
                                      storage_type_merged,
                                      test_full_emb_file.name,
                                      storage_type_full,
                                      dump_embs=True)
        bootleg_merged_emb = np.memmap(test_merged_emb_file.name,
                                       dtype=storage_type_merged,
                                       mode="r+")
        merged_emb_gold = np.memmap(gold_merged_emb_file.name,
                                    dtype=storage_type_merged,
                                    mode="r+")
        assert len(bootleg_merged_emb) == total_num_mentions
        for i in range(len(bootleg_merged_emb)):
            assert np.array_equal(bootleg_merged_emb[i]['entity_emb'],
                                  merged_emb_gold[i]['entity_emb'])

        # clean up
        if os.path.exists(temp_file):
            os.remove(temp_file)
        test_full_emb_file.close()
        test_merged_emb_file.close()
        gold_merged_emb_file.close()
Example #30
def create_current_writer(i):
    chunk_no = math.floor(i / CHUNK_SIZE)
    output_filename = OUT_DIR + '/ocdata.chunk' + str(chunk_no) + '.jsonl'
    current_writer = jsonlines.open(output_filename, mode='w')
    return current_writer
Example #31
def write_jsonl(filename, items):
  with jsonlines.open(filename, mode='w', dumps=json.JSONEncoder(default=json_serial).encode) as writer:
    writer.write_all(items)
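write_jsonl relies on a json_serial helper that is not shown here; a common sketch, assuming its purpose is to serialize datetime values that the stock JSON encoder rejects:

import datetime

def json_serial(obj):
    # Fallback serializer for objects the default JSON encoder cannot handle.
    if isinstance(obj, (datetime.datetime, datetime.date)):
        return obj.isoformat()
    raise TypeError("Type %s not serializable" % type(obj))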
Example #32
def write_jsonlist(list_of_json_objects, output_filename):
    with jsonlines.open(output_filename, mode='w') as writer:
        writer.write_all(list_of_json_objects)
Example #33
parser = argparse.ArgumentParser()
parser.add_argument('--corpus', type=str, required=True)
parser.add_argument('--dataset', type=str, required=True)
parser.add_argument('--model', type=str, required=True)
parser.add_argument('--rationale-selection', type=str, required=True)
parser.add_argument(
    '--mode',
    type=str,
    default='claim_and_rationale',
    choices=['claim_and_rationale', 'only_claim', 'only_rationale'])
parser.add_argument('--output', type=str, required=True)
args = parser.parse_args()

print(args.mode)

corpus = {doc['doc_id']: doc for doc in jsonlines.open(args.corpus)}
dataset = jsonlines.open(args.dataset)
rationale_selection = jsonlines.open(args.rationale_selection)
output = jsonlines.open(args.output, 'w')

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device "{device}"')

tokenizer = AutoTokenizer.from_pretrained(args.model)
config = AutoConfig.from_pretrained(args.model, num_labels=3)
model = AutoModelForSequenceClassification.from_pretrained(
    args.model, config=config).eval().to(device)

LABELS = ['CONTRADICT', 'NOT_ENOUGH_INFO', 'SUPPORT']

Example #34
    # Apparently the tail of the Features() helper used below; the start of the
    # function is not included in this excerpt.
    for xsf in strinngfeatures:
        f1.append(G_features[xsf])
        G_features[xsf] = 0
    f2 = np.array(f1)
    instructions['instructions'] = l_instructions
    f3.append(f2)
    instructions['features'] = l_features
    return instructions


p_DataSet = []
c1 = 0
f3 = []
y1 = []
y2 = []
with jsonlines.open("train_dataset.jsonl") as reader:
    for _instructions in reader:  # equivalently: reader.iter(type=dict, skip_invalid=True)
        p_DataSet.append(Features(_instructions))
        y1.append(G_opt[_instructions['opt']])
        y2.append(G_compiler[_instructions['compiler']])

db = file_archive("DbMalware.txt")
db['featuresnames'] = strinngfeatures
db['featuresdata'] = f3
db['opt'] = y1
db['G_opt'] = G_opt
db['compiler'] = y2
db['G_compiler'] = G_compiler
db.dump()
print('done')
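# Reading the archive back later -- a sketch assuming `file_archive` comes from
# the klepto package (its import is not shown in the original fragment):
from klepto.archives import file_archive

db = file_archive("DbMalware.txt")
db.load()
features = db['featuresdata']
opt_labels = db['opt']
compiler_labels = db['compiler']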
Beispiel #35
0
        if len(sys.argv) == 3:
            if sys.argv[2] in ('y', 'n'):
                POWER = sys.argv[2]
            else:
                print(
                    "Specify y for including power and n for not including it e.g.: python bagofwords.py s n"
                )
                exit()
    else:
        print(
            "Specify s for sender or r for receiver e.g.: python bagofwords.py s"
        )
        exit()
    #import data.  Specify directory path
    data_path = 'data/'

    with jsonlines.open(data_path + 'train.jsonl', 'r') as reader:
        train = list(reader)
    # Validation set not used for logistic regression, for consistency with the neural models:
    # with jsonlines.open(data_path + 'validation.jsonl', 'r') as reader:
    #     dev = list(reader)
    with jsonlines.open(data_path + 'test.jsonl', 'r') as reader:
        test = list(reader)

    #spacy used for tokenization
    nlp = English()

    log_reg(train, test)
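# `log_reg` is not shown in this fragment; a minimal bag-of-words baseline of
# the kind the comments describe might look like this. The 'text' and 'label'
# field names and the use of scikit-learn are assumptions, not the author's code.
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression

def log_reg(train, test):
    # Tokenize with spacy, vectorize with raw counts, then fit a plain logistic regression.
    vectorizer = CountVectorizer(tokenizer=lambda s: [t.text for t in nlp(s)])
    X_train = vectorizer.fit_transform(d['text'] for d in train)
    X_test = vectorizer.transform(d['text'] for d in test)
    clf = LogisticRegression(max_iter=1000)
    clf.fit(X_train, [d['label'] for d in train])
    print('test accuracy:', clf.score(X_test, [d['label'] for d in test]))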
Beispiel #36
0
def test_invalid_mode():

    with pytest.raises(ValueError) as exc:
        jsonlines.open('foo', mode='foo')
    assert 'mode' in str(exc.value)
Beispiel #37
0
import numpy as np
import pandas as pd
import os
from tqdm import tqdm
import jsonlines
from input.natural_questions.test_utils import simplify_nq_example
import json

json_dir = 'v1.0-simplified_nq-dev-all.jsonl'
dict_list = []
with open(json_dir) as f:
    for line in tqdm(f):
        dict_list.append(simplify_nq_example(json.loads(line)))

with jsonlines.open('simplified-nq-valid.jsonl', 'w') as writer:
    writer.write_all(dict_list)
Beispiel #38
0
def w_into_file(name):
    """Save data to .xlsx file."""
    workbook = xlsxwriter.Workbook('excel/' + name + '.xls')
    worksheet = workbook.add_worksheet()

    worksheet.write('A1', MAIN_FIELDS[0])  # Name
    worksheet.write('B1', MAIN_FIELDS[1])  # Specialty
    worksheet.write('C1', MAIN_FIELDS[2])  # Email
    worksheet.write('D1', MAIN_FIELDS[3])  # Emirates
    worksheet.write('E1', MAIN_FIELDS[4])  # Contact
    worksheet.write('F1', MAIN_FIELDS[5])  # Fax
    worksheet.write('G1', MAIN_FIELDS[6])  # Postal Code
    worksheet.write('H1', MAIN_FIELDS[7])  # Website
    worksheet.write('I1', MAIN_FIELDS[8])  # Address
    worksheet.write('J1', MAIN_FIELDS[9])  # Link

    items = len(os.listdir("tmp"))

    # Column letter -> key in the scraped jsonl records; missing keys are
    # written as empty strings.
    columns = [
        ('A', 'Name'), ('B', 'Specialty:'), ('C', 'Email:'),
        ('D', 'Emirates:'), ('E', 'Contact:'), ('F', 'Fax:'),
        ('G', 'Postal Code:'), ('H', 'Website:'), ('I', 'Address:'),
        ('J', 'Link'),
    ]

    i = 0
    for itm in range(items):
        with jsonlines.open("tmp/" + str(itm) + ".jsonl") as reader:
            for item in reader:
                for col, key in columns:
                    worksheet.write(col + str(i + 2), item.get(key, ''))
                i += 1
    workbook.close()
Beispiel #39
0
def preprocess_qdraw():    
    img_size = 128
    category = 'baseball'
    output = 'data/qdraw_{cat}_{img_size}'.format(cat=category, img_size=img_size)

    if os.path.exists(output):
        import shutil
        shutil.rmtree(output)
    os.mkdir(output)

    stroke_width = 2
    bbox_pad = 20
    cmap = plt.get_cmap('jet')
    need = 200000

    with jsonlines.open('data/{cat}.ndjson'.format(cat=category)) as reader:
        for count, obj in enumerate(reader):
            # print obj
            fn = output + '/' + obj['key_id'] + '.svg'
            if not os.path.isfile(fn):
                print(count, fn)
                dwg = svgwrite.Drawing(fn, profile='tiny', size=(img_size,img_size))
                drawing = obj['drawing']
                num_strokes = len(drawing)
                cnorm = colors.Normalize(vmin=0, vmax=num_strokes-1)
                cscalarmap = cmx.ScalarMappable(norm=cnorm, cmap=cmap)

                # get bbox
                bbox = [100000, 100000, -100000, -100000]
                for i, strokes in enumerate(drawing):
                    x = strokes[0]
                    y = strokes[1]
                    bbox[0] = min(bbox[0], np.amin(x))
                    bbox[1] = min(bbox[1], np.amin(y))
                    bbox[2] = max(bbox[2], np.amax(x))
                    bbox[3] = max(bbox[3], np.amax(y))

                bbox[0] -= bbox_pad
                bbox[1] -= bbox_pad
                bbox[2] += bbox_pad
                bbox[3] += bbox_pad
                # make it square
                dx = bbox[2]-bbox[0]
                dy = bbox[3]-bbox[1]
                b_size = float(max(dx,dy))

                # normalize and save
                for i, strokes in enumerate(drawing):
                    x = (np.asarray(strokes[0]) - bbox[0])/b_size*img_size
                    y = (np.asarray(strokes[1]) - bbox[1])/b_size*img_size
                    # t = strokes[2]
                    c = np.asarray(cscalarmap.to_rgba(i))[:3]*255
                    c_hex = '#%02x%02x%02x' % (int(c[0]), int(c[1]), int(c[2]))
                    # zip() is a one-shot iterator in Python 3; materialize the
                    # point list before handing it to svgwrite
                    dwg.add(dwg.polyline(points=list(zip(x, y)),
                                         stroke=c_hex,
                                         fill='none',
                                         stroke_width=stroke_width))

                dwg.viewbox(0, 0, img_size, img_size)
                dwg.save()

            if count >= need:
                break

    # split dataset
    file_list = []
    for root, _, files in os.walk(output):
        for file in files:
            file_list.append(file)

    num_files = len(file_list)
    ids = np.random.permutation(num_files)
    train_id = int(num_files * 0.9)
    with open(os.path.join(output,'train.txt'), 'w') as f: 
        for id in ids[:train_id]:
            f.write(file_list[id] + '\n')
    with open(os.path.join(output,'test.txt'), 'w') as f: 
        for id in ids[train_id:]:
            f.write(file_list[id] + '\n')
Beispiel #40
0
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from stop_words import get_stop_words
from string import punctuation
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.tree import DecisionTreeClassifier
import jsonlines
from langdetect import detect  # assumed provider of detect(); the import is not shown in the fragment

X = []
y = []
z = []

with jsonlines.open('data/reviews.txt', 'r') as f:
    for item in f:
        X.append(item['text'])
        y.append(item['voted_up'])
        z.append(item['early_access'])

def detect_text(text):
    """Return the detected language code, or 'unknown' when detection fails."""
    try:
        language = detect(text)
    except Exception:  # detection can raise on empty or ambiguous input
        language = "unknown"

    return language


def stem(tokens, lang):
Beispiel #41
0
import raw_data_process as rdp
import jsonlines

with jsonlines.open('vk_comms.jl') as comments:
    with jsonlines.open('norm_comms.jl', mode='w') as writer:
        for obj in comments:
            line = rdp.jl_line(item=obj)
            sentences = line.normalize()
            for item in sentences:
                writer.write(item)
Beispiel #42
0
    except:
        pass

    #return (answer_x, answer_binary)

            # If it's accept, convert the label of the correct one to 1, the others to 0, return all
            # If it's reject, convert the label of the presented one to 0, and DELETE the rows in the
            #   matrix/vector. If the presented one is false, we don't know if the other, non-presented
            #   ones were correct or not.

            # return the text labels, too, so we can look at per-country accuracy later.

    #    feat_list.append(feat)

error_count = 0
with jsonlines.open('geo_annotated/geo_country_db.jsonl') as reader:
    X = []
    Y = []
    for obj in reader:
        if obj['answer'] != 'ignore':
            try:
                x, label = entry_to_matrix(obj) # change to return matrices/vectors
                X.append(x)
                Y.append(label)
            except Exception:
                error_count += 1

print(error_count)

# format numpy
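# A sketch of the "format numpy" step hinted at above (array shapes are
# assumptions; numpy is assumed to be imported as np elsewhere in the script):
X_mat = np.vstack(X)
Y_vec = np.concatenate([np.atleast_1d(y) for y in Y])
print(X_mat.shape, Y_vec.shape)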
Beispiel #43
0
import json
import pickle
import pprint
import random
import time
import jsonlines
from requests import *

pp = pprint.PrettyPrinter(width=100)

apikey = ''
apisecret = ''
ping_url = 'https://api-v3.receptiviti.com/v3/api/ping'
content_api_url = 'https://api-v3.receptiviti.com/v3/api/content'

author_texts = pickle.load(open('/home/username/data/output/_jobs/liwc_posts_merged.pickle','rb'))

with jsonlines.open('/home/username/liwc_scores.jsonl', mode='w') as writer:
    for i, (author, text) in enumerate(author_texts[:1]):
        response_json = test_ping()
        writer.write(response_json)
        print('#' + str(i) + ' - ' + author + ' done.')
        time.sleep(random.randint(2, 6))


def merge(posts):
    _sorted = map(lambda x: x['text'], sorted(posts, key=lambda x: (x['link_id'], int(x['created_utc']))))
    return ' '.join(_sorted)


def test_ping():
    headers = auth_headers(apikey, apisecret)
    print("PING URL:---------> " + ping_url)
Beispiel #44
0
import json
import logging
import sys
from pathlib import Path

import jsonlines

if __name__ == "__main__":

    if len(sys.argv) == 1:
        logging.error("Usage: {} /path/to/json/file".format(sys.argv[0]))
        exit(1)

    if len(sys.argv) == 3:
        jsonl_dir = sys.argv[2]
    else:
        jsonl_dir = "./"

    json_path = Path(sys.argv[1])

    json_files = json_path.glob("*.json") if json_path.is_dir() else [
        json_path
    ]

    for json_file in json_files:
        if json_file.is_file():
            jsonl_file = Path(jsonl_dir, json_file.name).with_suffix(".jsonl")
            with open(json_file) as f:
                # Drop null-valued keys from every object before writing.
                records = [{k: v
                            for k, v in o.items() if v is not None}
                           for o in json.load(f)]
            with jsonlines.open(jsonl_file, mode='w') as writer:
                writer.write_all(records)
        else:
            logging.warning("Failed to open {}, skip".format(json_file))
Beispiel #45
0

def do_driver_lookups(articles, mediod_indices, drivers, num=5):
    drivertok = tokenize_texts(drivers)
    driver_jsonl = []
    for i, mdi in enumerate(mediod_indices):
        results = ranker.article2queries(word_tokenize(articles[mdi]),
                                         drivertok, num)
        results = [res['driver'] for res in results]
        driver_jsonl.append(get_driver_dict(results, i))
    return driver_jsonl


with open(data_labels_path, 'rb') as rpkl:
    data, labels = pickle.load(rpkl)

articles = []
with jsonlines.open(article_jsonlines_path, 'r') as jr:
    for line in jr:
        articles.append(line)
article_text = [ar['text'] for ar in articles]

with open(driverpath, 'r') as dr:
    # Avoid shadowing the file handle with the loop variable.
    drivers = [line.strip() for line in dr.readlines()]

mediod_indices = get_mediod_indices(data, labels)
driver_jsonl = do_driver_lookups(article_text, mediod_indices, drivers)

with jsonlines.open(outputpath, 'w') as jw:
    jw.write_all(driver_jsonl)