Example #1
def load_dataset(sentences, tags, word_to_id, tag_to_id, lower=False):
    """
    Prepare the dataset. Return a list of dictionaries, one per sentence, containing:
        - word indexes
        - tag indexes
    """
    def f(x):
        return x.lower() if lower else x

    data = []
    for str_words, sentence_tags in zip(sentences, tags):
        words = [
            word_to_id[f(w) if f(w) in word_to_id else '<UNK>']
            for w in str_words
        ]
        caps = [cap_feature(w) for w in str_words]
        tag_ids = [tag_to_id[t] for t in sentence_tags]
        data.append({
            'str_words': str_words,
            'words': words,
            'caps': caps,
            'tags': tag_ids,
            'pos': tag_ids,
        })
    return data
Example #2
    def preprocess(self, path, draft):
        output = []
        stopwords = [' ', '\n', '\u3000', '\u202f', '\u2009']  # whitespace characters to skip

        # read the SQuAD-style JSON (one JSON object per line)
        with open(path, 'r', encoding='utf-8') as f:
            data = []
            for line in f:
                data.append(json.loads(line))

            if draft:
                data[0]['data'] = data[0]['data'][:1]

            for topic in data[0]['data']:
                for paragraph in topic['paragraphs']:
                    context = paragraph['context']
                    tokens = word_tokenize(context)
                    for qa in paragraph['qas']:
                        qid = qa['id']
                        question = qa['question']
                        for ans in qa['answers']:
                            answer = ans['text']
                            s_idx = ans['answer_start']
                            e_idx = s_idx + len(answer)

                            # map the character offsets (s_idx, e_idx) onto token indexes
                            l = 0
                            s_found = False
                            for i, t in enumerate(tokens):
                                # skip whitespace between tokens in the raw context
                                while l < len(context):
                                    if context[l] in stopwords:
                                        l += 1
                                    else:
                                        break
                                # the tokenizer may normalize quote characters; adjust the
                                # token so its length matches the raw context
                                if t[0] == '"' and context[l:l + 2] == '\'\'':
                                    t = '\'\'' + t[1:]
                                elif t == '"' and context[l:l + 2] == '\'\'':
                                    t = '\'\''

                                l += len(t)
                                if l > s_idx and not s_found:
                                    s_idx = i
                                    s_found = True
                                if l >= e_idx:
                                    e_idx = i
                                    break

                            output.append({
                                'qid': qid,
                                'context': context,
                                'question': question,
                                'answer': answer,
                                'start_idx': s_idx,
                                'end_idx': e_idx,
                            })
                
        # write the flattened examples as JSON Lines next to the input file
        with open('{}l'.format(path), 'w', encoding='utf-8') as f:
            for line in output:
                json.dump(line, f)
                f.write('\n')
Example #3
def process_file(f, encoding='utf8'):
    """初始化函数
    Args:
        f[str] : 原始数据文件
        encoding[str]   : 编码(默认utf8)
    """
    data = []
    tag = []
    for line in f:
        x = ''
        y = ''
        nr_flag = False
        nt_flag = False
        nr_word = ''
        nt_word = ''
        #ignore the ID
        for pair in line.strip().split()[1:]:
            word = pair.split('/')[0]
            #split sentence with token '。'
            if word == u'。' and len(x) > 0:
                data.append(x+'\n')
                tag.append(y.strip()+'\n')
                x = ''
                y = ''
                continue
            #process nt words
            if pair.startswith('['):
                nt_flag = True
                nt_word = word[1:]
                continue
            if nt_flag:
                if not pair.endswith(']nt'):
                    nt_word += word
                    continue
            # process nr (person name) words: accumulate consecutive nr tokens
            if pair.endswith('nr'):
                nr_word += word
                continue
            elif len(nr_word) > 0:
                # flush the accumulated person name as a PER entity
                x += nr_word
                y += ' B-PER'+' I-PER'*(len(nr_word)-1)
                nr_word = ''
            if pair.endswith('nt'):
                if pair.endswith(']nt'):
                    word = nt_word+word
                    nt_flag = False
                x += word
                y += ' B-ORG'+' I-ORG'*(len(word)-1)
            # process ns tag
            elif pair.endswith('ns'):
                x += word
                y += ' B-LOC'+' I-LOC'*(len(word)-1)
            else:
                x += word
                y += ' O'*(len(word))
        if len(x) > 0:
            data.append(x+'\n')
            tag.append(y.strip()+'\n')
    return data, tag
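A minimal usage sketch, assuming input lines in the 1998 People's Daily word/POS format; the sample line below is hypothetical:

sample = ['19980101-01-001-001/m 北京/ns 欢迎/v 你/r 。/w']
data, tag = process_file(sample)
# data -> ['北京欢迎你\n'], tag -> ['B-LOC I-LOC O O O\n']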
Example #4
def get_test_data():
    val_data = pd.read_csv('./Dataset/test.csv')
    data = []
    for content, label in zip(val_data['content'], val_data['label']):
        data.append((content, label))
    random.shuffle(data)
    contents, true_labels = [], []
    for content, label in data:
        contents.append(content)
        true_labels.append(label)
    return contents, true_labels
Example #5
def json_to_csv(file, output_file):
    data = []
    with open(file) as f:
        for line in f:
            data.append(json.loads(line))
    df = pd.DataFrame.from_records(data)[[
        'sentence1', 'sentence2', 'gold_label'
    ]]
    df['gold_label'] = df['gold_label'].map(values_dict)
    df = df[df['gold_label'].isin([0, 1, 2])]
    df.to_csv(output_file, index=False)
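values_dict is defined elsewhere in the original module. A minimal usage sketch, assuming SNLI/MultiNLI-style labels; the mapping and file names below are hypothetical:

values_dict = {'entailment': 0, 'neutral': 1, 'contradiction': 2}  # assumed mapping

json_to_csv('snli_train.jsonl', 'snli_train.csv')  # hypothetical file names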
Example #6
def preprocess_data(input_list,
                    type="train",
                    letter_width=0.02,
                    hop_length=400):
    data = []
    for i in input_list:
        data.append(
            preprocess_data_single_entry(i,
                                         type=type,
                                         letter_width=letter_width,
                                         hop_length=hop_length))
    return data
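Since the loop only maps preprocess_data_single_entry over the input list, an equivalent list-comprehension form (behavior unchanged; preprocess_data_single_entry is defined elsewhere) would be:

def preprocess_data(input_list, type="train", letter_width=0.02, hop_length=400):
    return [
        preprocess_data_single_entry(i,
                                     type=type,
                                     letter_width=letter_width,
                                     hop_length=hop_length)
        for i in input_list
    ]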
Example #7
def add_padding(data, max_length, index_to_pad, index_to_eos):
    '''
    Add padding.

    :param data: list of token indexes for one line
    :param max_length: maximum number of words per line
    :param index_to_pad: index of the pad token
    :param index_to_eos: index of the end-of-line (eos) token
    :return: padded list of exactly max_length indexes
    '''
    data = data[:max_length - 1]  # truncate by one so the eos token index can be appended
    padding_len = (max_length - 1) - len(data)
    assert padding_len >= 0, 'padding_len must be non-negative'
    data.extend([index_to_pad] * padding_len)
    data.append(index_to_eos)
    return data
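A minimal usage sketch with hypothetical pad and eos indexes (0 and 1):

print(add_padding([5, 6, 7], max_length=6, index_to_pad=0, index_to_eos=1))
# -> [5, 6, 7, 0, 0, 1]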
Example #8
def user_builder(df):
    '''Given the flattened dataframe, output a user-based collection of data.'''

    data = []
    user_ids = df.user_id.unique()

    for i in tnrange(len(user_ids)):
        user = user_ids[i]
        sub_df = df.loc[df['user_id'] == user].reset_index(drop=True)
        country = sub_df['country_id'][0]
        join_date = sub_df['joining_date'][0]
        sorted_df = sub_df.sort_values('unix_timestamp').reset_index(drop=True)
        history = sorted_df['cities'].tolist()
        timestamps = sorted_df['unix_timestamp'].tolist()
        history = [(timestamps[j], history[j]) for j in range(len(sorted_df))]
        data.append((user, country, join_date, history))

    return data
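A minimal usage sketch with a hypothetical two-visit dataframe (tnrange comes from tqdm and must be imported alongside pandas):

import pandas as pd
df = pd.DataFrame({
    'user_id': [1, 1],
    'country_id': ['DE', 'DE'],
    'joining_date': ['2015-01-01', '2015-01-01'],
    'unix_timestamp': [200, 100],
    'cities': ['Berlin', 'Munich'],
})
print(user_builder(df))
# -> [(1, 'DE', '2015-01-01', [(100, 'Munich'), (200, 'Berlin')])]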
Example #9
def data_process(src_path, tgt_path, src_tokenizer, trg_tokenizer, src_vocab,
                 trg_vocab):
    data = []
    # read the two files in parallel; the with-block makes sure they are closed
    with io.open(src_path, encoding="utf8") as src_iter, \
            io.open(tgt_path, encoding="utf8") as trg_iter:
        for raw_src, raw_trg in zip(src_iter, trg_iter):
            raw_src, raw_trg = preprocess(raw_src), preprocess(raw_trg)
            src_tensor = torch.tensor(
                [src_vocab[token] for token in src_tokenizer(raw_src)],
                dtype=torch.long)
            trg_tensor = torch.tensor(
                [trg_vocab[token] for token in trg_tokenizer(raw_trg)],
                dtype=torch.long)

            data.append((src_tensor, trg_tensor))
    return data
Example #10
def evaluate(model, data_iter, tipe, epoch, fold):

    # args, output and data below are module-level globals defined elsewhere in the script
    model.eval()
    corrects, avg_loss = 0, 0

    data_iter.sort_key = lambda x: len(x.text)

    for batch in data_iter:

        text_numerical, target = batch.text, batch.label

        if args.cuda:
            text_numerical, target = text_numerical.cuda(), target.cuda()

        text_numerical.data.t_()
        target.data.sub_(1)

        forward = model(text_numerical)
        loss = F.cross_entropy(forward, target, reduction='sum')

        avg_loss += loss.item()
        corrects += (torch.max(forward, 1)[1].view(
            target.size()).data == target.data).sum()

    size = len(data_iter.dataset)
    avg_loss = avg_loss / size
    accuracy = 100.0 * corrects / size

    # print('Avg Loss = {}'.format(avg_loss))
    output['fold_{}'.format(fold + 1)]['epoch_{}'.format(
        epoch)][tipe]['avg_loss'] = avg_loss

    if tipe == 'testing':
        if args.cuda:
            data.append(accuracy.item())
        else:
            data.append(accuracy)

    return target.data, torch.max(forward, 1)[1].view(target.size()).data
Example #11
    type=int,
    default=1,
    help='number of ngrams to be selected in each batch [default: 1]')
parser.add_argument('-threshold',
                    type=float,
                    default=0.9,
                    help='threshold for selecting ngram [default: 0.9]')
args = parser.parse_args()

# load up data
data = []
with open('/mnt/storage01/milliet/data/ngrams/clean-ngrams-score-9500.csv',
          'r') as csvfile:
    lines = csvfile.readlines()
    for line in lines:
        # fields are separated by the literal token "\sep"
        data.append(line.split(r'\sep'))

threshold = 0.9999  # hard-coded; the -threshold argument parsed above is not used here
bests = list(elem for elem in data if float(elem[-1]) > threshold)

print("N-grams with score > " + str(threshold) + ": " + str(len(bests)))

if len(bests) > 0:
    args.summary_dir = os.path.join(
        args.save_dir,
        datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S') + "threshold_" +
        str(threshold) + "_model_dict_" + args.state_dict)
    writer = SummaryWriter(log_dir=args.summary_dir)

    x_data = list(
        re.sub(r"\s+", ",", elem[1][2:-2]).split(',') for elem in bests)
Example #12
def augmentation(data):
    augmented = []
    aug_df = pd.DataFrame({'summary': [], 'rating': []})
    reps = []
    for index, game in data.iterrows():
        genre = game['rating']

        if (genre == 'Dislike'):
            s = 38
            tok = tokenize_word(game['summary'])
        elif (genre == 'Acclaim'):
            s = 14
            tok = tokenize_word(game['summary'])
        else:
            s = 3
            tok = tokenize_sent(game['summary'])

        # (a longer, per-genre oversampling scheme was left commented out here)

        shuffled = [tok]

        # generate s additional shuffled variants of this review
        for i in range(s):
            shuffled.append(shuffle_tokenized(shuffled[-1]))
        for k in shuffled:
            # create a new review by joining the shuffled tokens/sentences
            new_game = ' '.join(k)
            if new_game not in augmented:
                augmented.append(new_game)
                # pd.concat replaces DataFrame.append, which was removed in pandas 2.0
                aug_df = pd.concat(
                    [aug_df, pd.DataFrame({
                        'summary': [new_game],
                        'rating': [genre]
                    })],
                    ignore_index=True)
            else:
                reps.append(new_game)
    return pd.concat([data, aug_df], ignore_index=True)