Example #1
def generate_ner(args) -> None:
    """
    Two steps in total:
        Step 1: run the model to recognize entities
        Step 2: split each document on Chinese sentence-final punctuation
    Args:
        args:
            --file_root : root path of data
    """

    file_names = scan_files(args.file_root)  # type:List[str]
    for file in file_names:
        data = load_file(args.file_root, file, "txt")

        # Part 1: compute the NER result for the current article
        prepare_data = prepare(data)  # type:np.ndarray
        result = predict(prepare_data)  # type:np.ndarray
        _, ner_result = decode_result(result=result,
                                      sent_pre=prepare_data,
                                      sent=data)

        with open(args.file_root + file + "_ner.pkl", 'wb') as f:
            pickle.dump(ner_result, f)

        # Part 2: split the article at sentence-final punctuation
        # (period / question mark / exclamation mark) and record the
        # (start, end) spans in a list
        start, end = 0, 0
        sentence_split_result = []
        stop_tokens = ["。", "!", "?"]
        for idx, c in enumerate(data):
            if c in stop_tokens:
                end = idx
                sentence_split_result.append((start, end))
                start = end + 1

        with open(args.file_root + file + "_sentence_split.pkl", 'wb') as f:
            pickle.dump(sentence_split_result, f)
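
A minimal, self-contained sketch of the Part 2 splitting logic on a toy string (the sample text and the final print are illustrative, not from the original):

data = "今天天气很好。你去哪里?走吧!"
stop_tokens = ["。", "!", "?"]

start, sentence_split_result = 0, []
for idx, c in enumerate(data):
    if c in stop_tokens:
        # each span is (start, index of the punctuation mark)
        sentence_split_result.append((start, idx))
        start = idx + 1

print(sentence_split_result)  # [(0, 6), (7, 11), (12, 14)]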
Example #2
def make_predictions(pairs, encoder, decoder, char2i, outputfn,
                     batch_size=100, use_cuda=True):
    # char2i maps character -> index; invert it for decoding
    i2char = {idx: ch for ch, idx in char2i.items()}
    batches = get_batches(pairs, batch_size, char2i,
                          PAD_symbol, use_cuda, test_mode=True)
    with open(outputfn, "w") as out:
        for batch in batches:
            preds = predict(encoder, decoder, batch,
                            list(char2i.keys()), use_cuda)

            for j in range(batch.size):
                # Position of the second EOS in column j; the prediction
                # is written up to (but not including) it
                eos = (preds[:, j] == EOS_index).nonzero().data[1][0]
                input_text = batch.inputs[j]
                input_enc = [i2char[int(c)] for c in
                             batch.input_variable.t()[j]]
                pred = ''.join([i2char[int(c)] for c in preds[1:eos, j]])
                pred = replace_UNK(input_text, input_enc, pred)

                out.write(pred)
                out.write("\n")
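
A toy illustration of the EOS indexing used above, assuming preds has shape (seq_len, batch) and the decoder emits a marker matching EOS_index at position 0, so the second match ends the sequence (the tensor values here are made up):

import torch

EOS_index = 3
preds_col = torch.tensor([3, 9, 7, 3, 5, 3])        # one column of preds
eos_positions = (preds_col == EOS_index).nonzero()  # tensor([[0], [3], [5]])
eos = eos_positions[1][0]                           # second EOS, at index 3
print(preds_col[1:eos])                             # tensor([9, 7])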
Example #3
def test(d, tgt_domain):

    alphas = [0.001, 0.1, 1, 10, 100, 1000]
    models = [Ridge(alpha=a) for a in alphas]

    trn_mses, dev_mses, test_mses = [], [], []
    for model in models:
        trn_mse, dev_mse, test_mse = evaluate.predict(
            model, tgt_domain, d.trn_X, d.trn_y, d.dev_X, d.dev_y,
            d.test_X, d.test_y)
        trn_mses.append(trn_mse)
        dev_mses.append(dev_mse)
        test_mses.append(test_mse)
    return trn_mses, dev_mses, test_mses, models
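
A hedged usage sketch: picking the alpha with the lowest dev-set MSE from the lists test returns (the d data object comes from the snippet; the 'books' domain label is a made-up placeholder):

trn_mses, dev_mses, test_mses, models = test(d, tgt_domain='books')

best = min(range(len(dev_mses)), key=lambda i: dev_mses[i])
print(f"best alpha={models[best].alpha}, "
      f"dev MSE={dev_mses[best]:.4f}, test MSE={test_mses[best]:.4f}")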
Example #4
def scrapeBets():
    matchDF_filename = './csv/static/matchDF.csv'
    test = len(sys.argv) > 1 and sys.argv[1] == 'test'

    def processGamesPlaying(driver):
        url = 'https://www.flashscore.com/table-tennis/'

        def doSomething(driver):
            # Click a filter button in the live table (located by XPath)
            button = driver.find_element(
                'xpath', '//*[@id="live-table"]/div[1]/div/div[2]/div[1]')
            button.click()

        page_source, driver = doSomethingFetchPageSource(
            url, driver=driver, doSomething=[doSomething])
        text_file = open("./temp/gamesPlaying.txt", "w")
        text_file.write(page_source)
        text_file.close()

        search_regex = '.event__participant'
        s = BeautifulSoup(str(page_source), 'html.parser')
        players = s.select(search_regex)
        names = []
        for player in players:
            p = player.text
            # Keep the part before "(" when present; an unguarded find()
            # of -1 would silently drop the last character
            if '(' in p:
                p = p[:p.index('(')]
            names.append(p.strip())
        return names

    cols = ['time', 'lTeam', 'rTeam', 'lLine', 'rLine', 'link']

    def processBetOnline(driver):
        url = 'https://beta.betonline.ag/sportsbook/table-tennis/todaygames'
        base = 'https://beta.betonline.ag'
        page_source = fetchPS(url, test, driver)

        text_file = open("./temp/betonline.txt", "w")
        text_file.write(page_source)
        text_file.close()

        def formatTeamNames(teams):
            # Assumes "Last, First" format and shortens it to "Last F."
            def formatSide(side):
                end = side.find(',')
                return (side[:end] + ' ' + side[end + 2:end + 3] + '.').strip()

            return [formatSide(teams[0].text), formatSide(teams[1].text)]

        s = BeautifulSoup(str(page_source), 'html.parser')
        search_regex = '.offering-today-games__table-row'
        time_regex = '.offering-today-games__link'
        matches = s.select(search_regex)
        rows = []
        for match in matches:
            teams = match.select('.lines-row__team-name')
            lines = match.select('.lines-row__money')
            time = match.select(time_regex)
            teams = formatTeamNames(teams)
            link = base + time[0].get('href')
            rows.append([
                time[0].text, teams[0], teams[1], lines[0].text.strip('()'),
                lines[1].text.strip('()'), link
            ])
        # Row-wise DataFrame.append is deprecated; collect rows first
        df = pd.DataFrame(rows, columns=cols)
        df['platform'] = url
        return df

    def processBovada(driver):
        url = 'https://www.bovada.lv/sports/table-tennis'
        base = 'https://www.bovada.lv'
        page_source = fetchPS(url,
                              test,
                              driver,
                              waitFor=['class', 'grouped-events'])
        text_file = open("./temp/bovada_ps.txt", "w")
        text_file.write(page_source)
        text_file.close()

        def formatLines(lines):
            def formatLine(line):
                l = line.strip()
                if l == 'EVEN':
                    return '+100'
                return l

            if len(lines) > 2:
                return [formatLine(lines[2].text), formatLine(lines[3].text)]
            return [formatLine(lines[0].text), formatLine(lines[1].text)]

        def formatTeamNames(teams):
            # "Last, First" -> "Last F."; names without a comma are
            # treated as "First Last" -> "Last F."
            def formatSide(side):
                end = side.find(',')
                if end == -1:
                    end = side.find(' ')
                    return (side[end:] + ' ' + side[0:1] + '.').strip()
                return (side[:end] + ' ' + side[end + 2:end + 3] + '.').strip()

            return [formatSide(teams[0].text), formatSide(teams[1].text)]

        def formatTime(time):
            text = time[0].text
            e = text.find(' ', 2)
            return text[e + 1:]

        s = BeautifulSoup(str(page_source), 'html.parser')
        df = pd.DataFrame([], columns=cols)
        search_regex = '.coupon-content.more-info'
        time_regex = '.period'
        s = s.select('.next-events-bucket')
        if len(s) == 0:
            return df
        s = s[0]
        matches = s.select(search_regex)
        rows = []
        for match in matches:
            teams = match.select('.competitor-name')
            link = match.select('.game-view-cta')
            link = base + link[0].find_all('a', href=True)[0].get('href')
            lines = match.select('.bet-price')
            time = match.select(time_regex)
            lines = formatLines(lines)
            teams = formatTeamNames(teams)
            time = formatTime(time)
            rows.append([
                time, teams[0], teams[1], lines[0].strip('()'),
                lines[1].strip('()'), link
            ])
        df = pd.DataFrame(rows, columns=cols)
        df['platform'] = url
        return df

    def findCorresponding(df, l, r):
        # Match a betting row to a game row in either player order
        k = df.loc[((df['lPlayer'] == r.strip()) &
                    (df['rPlayer'] == l.strip())) |
                   ((df['lPlayer'] == l.strip()) &
                    (df['rPlayer'] == r.strip()))]
        return k

    def getCorrespondingGames(df):
        gameDF = pd.read_csv(matchDF_filename)
        gameDF = gameDF.loc[gameDF['lScore'] == '-']
        d = pd.DataFrame()
        o = pd.DataFrame()
        for index, i in df.iterrows():
            k = findCorresponding(gameDF, i['lTeam'], i['rTeam'])
            if k.shape[0] != 0:
                d = pd.concat([d, k.iloc[[0]]])
                i['merge_index'] = k.id.values[0]
                o = pd.concat([o, i.to_frame().T])
        return d, o

    def fetchPS(url, test, driver, **kwargs):
        if not test:
            if 'waitFor' in kwargs:
                ps, driver = fetchPageSource(url,
                                             waitFor=kwargs['waitFor'],
                                             driver=driver)
            else:
                ps, driver = fetchPageSource(url, driver=driver)
            with open("./Debug/ps.txt", "w") as text_file:
                text_file.write(ps)
        else:
            # Test mode replays the last saved page source
            with open('./Debug/ps.txt', 'r') as f:
                ps = f.read()
        return ps

    driver = createDriver()
    gamesPlaying = processGamesPlaying(driver)
    k = processBetOnline(driver)
    l = processBovada(driver)
    k.to_csv('./temp/betOnline.csv')
    l.to_csv('./temp/bovada.csv')
    bettingSitesDF = [k, l]
    df = pd.concat(bettingSitesDF)

    def formatLines(df):
        # Canonicalize ordering: ensure lTeam sorts before rTeam,
        # swapping the lines together with the names
        df.loc[df['rTeam'].str.strip() > df['lTeam'].str.strip(),
               ['lTeam', 'rTeam', 'lLine', 'rLine']] = df.loc[
                   df['rTeam'].str.strip() > df['lTeam'].str.strip()][[
                       'rTeam', 'lTeam', 'rLine', 'lLine'
                   ]].values
        df = df.sort_values('time')
        df.to_csv('./temp/combined.csv')
        return df

    df.to_csv('./temp/bettingSitesDF.csv')
    df = formatLines(df).reset_index()
    cdf, cbdf = getCorrespondingGames(df)
    cdf.to_csv('./temp/cdf.csv')
    cbdf.to_csv('./temp/cbdf.csv')

    ul = set(list(cdf['lPlayer'].unique()) + list(cdf['rPlayer'].unique()))
    mdf = pd.read_csv(matchDF_filename)
    udf = mdf[(mdf['lPlayer'].isin(ul)) | (mdf['rPlayer'].isin(ul))]

    udf.to_csv('./test_before.csv')
    formatted = formatter(udf, True, ignore_ids=cdf['id'])

    formatted = formatted[formatted['id'].isin(cdf['id'])]
    formatted = formatted.merge(mdf, on='id')

    def formatSequencer(df, seq):
        # Normalize "dd.mm." / "dd.mm. hh:mm" strings into sortable
        # integers (the year 2020 is hardcoded for rows with a time)
        df.loc[~df[seq].str.contains(' '), seq] = df[seq] + '0000'
        df.loc[df[seq].str.contains(' '),
               seq] = df[seq].str[0:6] + '2020' + df[seq].str[-5:]
        df[seq] = df[seq].str.replace('.', '', regex=False)
        df[seq] = df[seq].str.replace(':', '', regex=False)
        df[seq] = df[seq].str[4:8] + df[seq].str[2:4] + df[seq].str[0:2] + df[
            seq].str[8:]
        df[seq] = df[seq].astype(int)
        return df

    formatted = formatSequencer(formatted, 'datetime')
    formatted.to_csv('./temp/merged.csv')

    predictions = predict(formatted)
    formatted['predictions'] = predictions.tolist()
    formatted['rWinPred'] = formatted['predictions'].apply(lambda x: x[0])
    formatted['lWinPred'] = formatted['predictions'].apply(lambda x: x[1])

    formatted = formatted.merge(cbdf, left_on='id', right_on='merge_index')

    formatted.to_csv('./temp/formatted.csv')

    formatted = formatted[formatted['lLine'] != '']
    formatted = formatted[formatted['rLine'] != '']

    formatted = swap(formatted, ['lTeam', 'Player_left'],
                     [['lTeam', 'rTeam'], ['lLine', 'rLine']])

    formatted['lOdds'] = formatted['lLine'].astype(int).apply(
        americanToImplied)
    formatted['rOdds'] = formatted['rLine'].astype(int).apply(
        americanToImplied)

    formatted['ledge'] = round(formatted['lWinPred'] - formatted['lOdds'], 4)
    formatted['redge'] = round(formatted['rWinPred'] - formatted['rOdds'], 4)

    formatted['lOdds'] = round(formatted['lOdds'], 4)
    formatted['rOdds'] = round(formatted['rOdds'], 4)

    formatted['lWinPred'] = round(formatted['lWinPred'], 4)
    formatted['rWinPred'] = round(formatted['rWinPred'], 4)

    formatted = formatted.sort_values('datetime')
    formatted = getLargestInGroup(formatted, ['id'], 'ledge', 'redge')
    formatted = formatted.sort_values('datetime')
    formatted = filterOnlyNew(formatted, gamesPlaying, 'datetime')
    formatted = formatted.sort_values('datetime')

    cols = [
        'datetime', 'id', 'lTeam', 'rTeam', 'Player_left', 'Player_right',
        'lWinPred', 'rWinPred', 'lOdds', 'rOdds', 'lLine', 'rLine', 'ledge',
        'redge', 'platform', 'link'
    ]
    formatted[cols].to_csv('./predictions.csv')
    print("done")
Example #5
                     log_dir=args.logdir,
                     verbose=args.verbose)

    ###############################################
    ##                 Predict                   ##
    ###############################################
    if not args.test:
        load_model(args.modeldir, cnn)
    else:
        logger.info('Testing on val set:')
        val_acc = test(cnn,
                       val_iter,
                       text_field,
                       label_field,
                       cuda=args.cuda,
                       verbose=args.verbose)
    predict(cnn,
            val_iter,
            text_field,
            label_field,
            os.path.join(args.predout, 'predict_val.txt'),
            cuda=args.cuda,
            verbose=args.verbose)
    predict(cnn,
            test_iter,
            text_field,
            label_field,
            os.path.join(args.predout, 'predict_test.txt'),
            cuda=args.cuda,
            verbose=args.verbose)
Example #6
print(f"Shape of X_train: {X_train.shape}")
print(f"Shape of X_test : {X_test.shape}\n")

print(f"Shape of y_train: {y_train.shape}")
print(f"Shape of y_test : {y_test.shape}\n")

print(f"Ratio: {len(X_train)/len(X_test)}")

# Training
print("Start training...")
model = train(X_train, y_train, model_path)
print("Training done!!")

# Evaluate
## Accuracy
print("Train Acc")
get_acc(model, X_train, y_train)

print("\nTest Acc")
get_acc(model, X_test, y_test)

## Print result
print_report(model, X_test, y_test, le.classes_, report_path)

## Calculate confidence
caculate_confidence(model, X_test, y_test)

## Predict
text = 'who are the actresses in the movies'
predict(model, vectorizer, le, text)
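
predict here is a project helper that isn't shown; a plausible sketch of what it does with the vectorizer and the LabelEncoder le (only the call signature comes from the snippet, the body is an assumption):

def predict(model, vectorizer, le, text):
    # Vectorize the raw string, classify it, and map the predicted
    # class index back to its label
    X = vectorizer.transform([text])
    y = model.predict(X)
    print(le.inverse_transform(y))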
Example #7
import visualize
import evaluate

if __name__ == '__main__':
    train_dataset, test_dataset, encoder = utilities.load_data()

    model = tf.keras.Sequential([
        encoder,
        tf.keras.layers.Embedding(
            input_dim=len(encoder.get_vocabulary()),
            output_dim=64,
            mask_zero=True),
        tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dense(1)
    ])

    model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
                  optimizer=tf.keras.optimizers.Adam(1e-4),
                  metrics=['accuracy'])

    history = model.fit(train_dataset, epochs=1,
                        validation_data=test_dataset,
                        validation_steps=30)

    test_loss, test_acc = model.evaluate(test_dataset)

    visualize.display_results(test_loss, test_acc, history)

    evaluate.predict(model)
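
Because the loss uses from_logits=True and the last layer is a 1-unit Dense, the model's raw outputs are logits; a small sketch of turning them into probabilities, assuming encoder is a TextVectorization layer so the model accepts raw strings (the sample sentences are made up):

sample_texts = tf.constant(['the movie was great!', 'a dull, predictable plot'])
logits = model.predict(sample_texts)  # shape (2, 1), raw logits
probs = tf.sigmoid(logits)            # positive-class probabilities
print(probs.numpy())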
Example #8
def main():
    # Data preprocessing
    data = PrepareData()
    args.src_vocab = len(data.en_word_dict)
    args.tgt_vocab = len(data.cn_word_dict)
    print("src_vocab %d" % args.src_vocab)
    print("tgt_vocab %d" % args.tgt_vocab)

    # Initialize the model
    model = make_model(args.src_vocab, args.tgt_vocab, args.layers,
                       args.d_model, args.d_ff, args.h_num, args.dropout)

    if args.type == 'train':
        # Train
        print(">>>>>>> start train")
        criterion = LabelSmoothing(args.tgt_vocab,
                                   padding_idx=0,
                                   smoothing=0.0)
        optimizer = NoamOpt(
            args.d_model, 1, 2000,
            torch.optim.Adam(model.parameters(),
                             lr=0,
                             betas=(0.9, 0.98),
                             eps=1e-9))
        train(data, model, criterion, optimizer)
        print("<<<<<<< finished train")
    elif args.type == "evaluate":  # 预测
        # 先判断模型有没有训练好(前提)
        if os.path.exists(args.save_file):
            # Load the model
            model.load_state_dict(torch.load(args.save_file))
            # Run the evaluation
            print(">>>>>>> start evaluate")
            precision = evaluate(data, model)
            # Column layout assumed by the indexing: [TP, FP, TN, FN]
            TP_total = precision.sum(axis=0)[0]
            FP_total = precision.sum(axis=0)[1]
            TN_total = precision.sum(axis=0)[2]
            FN_total = precision.sum(axis=0)[3]
            TPR = TP_total / (TP_total + FN_total)  # true positive rate
            TNR = TN_total / (TN_total + FP_total)  # true negative rate
            print(
                'total true positive amount: %.3f, total false negative amount: %.3f'
                % (TP_total, FN_total))
            print(
                'total true negative amount: %.3f, total false positive amount: %.3f'
                % (TN_total, FP_total))
            print('symbol within feature TPR: %.3f, delimiter TNR: %.3f' %
                  (TPR, TNR))
            print("<<<<<<< finished evaluate")
        else:
            print("Error: pleas train before evaluate")
    elif args.type == "predict":  #输入特征并预测
        if os.path.exists(args.save_file):
            # 加载模型
            model.load_state_dict(torch.load(args.save_file))
            # 开始预测
            print(">>>>>>> start predict")
            translation = predict(data, model)
            print("<<<<<<< finished predict")
    else:
        print("Error: please select type within [train / evaluate / predict]")
Example #9
def predict(self, image_arr, x_test, t1_image, out_file):
    y_pred = evaluate.predict(self.model, image_arr, x_test)
    save_mask(t1_image, x_test, y_pred, out_file)