def combine_labeled_data():
    """Merge the negative and positive pair files into one shuffled file.

    Negative pairs are read from ``baidu_rel.tsvwb`` and positive pairs from
    ``baidu_sim.tsvwb`` (both under ``DATA_ROOT``). Each usable line holds
    exactly two tab-separated fields; both fields are passed through
    ``cleanup``. Negative pairs must additionally pass ``check_negative``.
    Rows are emitted as ``w1<TAB>w2<TAB>label`` with label ``0`` for
    negatives and ``1`` for positives, shuffled, and written GBK-encoded to
    ``baidu_qq.tsv`` and to ``baidu_qq.tsv.new``.
    """
    fname_neg = DATA_ROOT + "filtered/bdv2/baidu_rel.tsvwb"
    fname_pos = DATA_ROOT +  "filtered/bdv2/baidu_sim.tsvwb"
    fname_all = DATA_ROOT + "filtered/bdv2/baidu_qq.tsv"
    result = []

    # Every line is examined, including the last one; a trailing blank line
    # is rejected by the two-field check anyway.
    neg_count = 0
    for line in util.readlines_from_file(fname_neg):
        vals = line.split("\t")
        if len(vals) != 2:
            continue
        w1 = cleanup(vals[0])
        w2 = cleanup(vals[1])
        if not check_negative(w1, w2):
            continue
        result.append(w1 + "\t" + w2 + "\t0\n")
        neg_count += 1

    # Not too many positives: only the first `neg_count` lines of the
    # positive file are considered, each of them exactly once.
    for pos_idx, line in enumerate(util.readlines_from_file(fname_pos)):
        if pos_idx >= neg_count:
            break
        vals = line.split("\t")
        if len(vals) != 2:
            continue
        w1 = cleanup(vals[0])
        w2 = cleanup(vals[1])
        result.append(w1 + "\t" + w2 + "\t1\n")

    random.shuffle(result)

    # Both output files receive the full shuffled set.
    for out_name in (fname_all, fname_all + ".new"):
        with codecs.open(out_name, "w", "gbk") as fw:
            for res in result:
                fw.write(res)

    '''with codecs.open(fname_all+".2", "w", "gbk") as f1:
Exemple #2
0
def read_data(fname):
    """Return the first space-separated token of every line in *fname*.

    Each token is stripped of surrounding whitespace; one entry is produced
    per line returned by ``util.readlines_from_file``.
    """
    return [
        row.split(" ")[0].strip()
        for row in util.readlines_from_file(fname)
    ]
Exemple #3
0
    def load(self):
        """
        Load the `crontab` file.

        Resets ``self._judges`` to an empty list, then hands every line of
        ``Crontab.FILENAME`` to ``self._load_one_line``. An info message is
        logged when no judges remain registered afterwards.
        """
        self._judges = []

        for entry in util.readlines_from_file(Crontab.FILENAME):
            self._load_one_line(entry)

        if not self._judges:
            log.info('The `crontab` file has no settings.')
def intercept():
    """Keep the labeled rows whose label matches the voted prediction.

    The i-th value returned by ``data_vote()`` is compared with the i-th
    line of ``baidu_qq.tsv.new``. After stripping and removing the
    ``__label__`` marker, an empty vote is skipped. Otherwise the row is
    written to ``join.tsv`` when its third tab-separated column equals the
    vote.

    Pairing stops at the shorter of the two sequences, and rows with fewer
    than three columns are skipped instead of raising ``IndexError``.
    """
    votes = data_vote()
    data = util.readlines_from_file(DATA_ROOT + "filtered/bdv2/baidu_qq.tsv.new")
    # `with` guarantees the output file is closed even if a row fails.
    with open(DATA_ROOT + "filtered/bdv2/join.tsv", "w") as fw:
        # zip keeps votes and rows aligned by position; skipped votes still
        # consume their row.
        for vote, row in zip(votes, data):
            predicted = vote.strip().replace("__label__", "")
            if not predicted:
                continue
            fields = row.strip().split("\t")
            if len(fields) < 3:
                continue
            if fields[2] == predicted:
                fw.write(row + "\n")
Exemple #5
0
def gen_label():
    """Write the third column of the data file next to the prediction column.

    Collects the third tab-separated field of every three-field line in
    ``baidu_qq.tsv.new`` and the second field of every two-field line in
    ``pred.tsv2``, then writes them pairwise (tab-separated) to
    ``baidu_qq.tsv.new.label``.

    If the two columns differ in length, ``wrong`` is printed and only the
    overlapping prefix is written, rather than failing with ``IndexError``
    part-way through the output file.
    """
    fname1 = "C:\\Workspace\\Data\\wenda\\filtered\\bdv2\\odmodel_newdata\\baidu_qq.tsv.new"
    fname2 = "C:\\Workspace\\Data\\wenda\\filtered\\bdv2\\odmodel_newdata\\pred.tsv2"

    c1 = []
    for d1 in util.readlines_from_file(fname1):
        vals = d1.split("\t")
        if len(vals) == 3:
            c1.append(vals[2])

    c2 = []
    for d2 in util.readlines_from_file(fname2):
        vals = d2.split("\t")
        if len(vals) == 2:
            c2.append(vals[1])

    if len(c1) != len(c2):
        # Parenthesised form prints the same text on Python 2 and 3.
        print("wrong")

    with open(fname1 + ".label", "w") as fw:
        for left, right in zip(c1, c2):
            fw.write(left + "\t" + right + "\n")
def iconv(f1, f2, enfrom="utf-8", ento="gbk"):
    """Re-encode the lines of *f1* into *f2*.

    Each line has runs of commas collapsed to a single comma and is then
    passed to ``change_encoding(line, enfrom, ento)``. Lines for which the
    conversion raises are reported on stdout and skipped; the remaining
    lines are written to *f2*, each followed by a newline.

    Args:
        f1: path of the input file.
        f2: path of the output file (overwritten).
        enfrom: source encoding handed to ``change_encoding``.
        ento: target encoding handed to ``change_encoding``.
    """
    # Read the input before opening the output, so a failed read (or
    # f1 == f2) does not leave a truncated file behind.
    lines = util.readlines_from_file(f1)
    with open(f2, "w") as fw:
        for line in lines:
            line = re.sub(",+", ",", line)
            try:
                s = change_encoding(line, enfrom, ento)
            except Exception as e:
                # str(e) instead of e.message: the attribute is deprecated
                # on Python 2.6+ and absent on Python 3.
                print(str(e), line)
                continue
            fw.write(s + "\n")
Exemple #7
0
def get_qq_es(fname):
    """Turn an ``input:`` / candidate listing into question-pair rows.

    Lines starting with ``input:`` set the current left-hand question.
    Every other non-empty line has its last space-separated token removed,
    and the remainder becomes the right-hand question. One row
    ``q1<TAB>q2<TAB>-1`` is written to ``fname + ".qq"`` per candidate line.

    Args:
        fname: path of the listing file to read.
    """
    data = util.readlines_from_file(fname)
    with open(fname + ".qq", "w") as fw:
        q1 = ""
        for line in data:
            line = line.strip()
            if line.startswith("input:"):
                q1 = line[len("input:"):]
                continue
            if not line:
                continue
            # rsplit keeps the whole line when it has no space; slicing at
            # rfind's -1 would silently drop the last character instead.
            q2 = line.rsplit(" ", 1)[0]
            fw.write(q1 + "\t" + q2 + "\t-1\n")
Exemple #8
0
def proc_score():
    """Rewrite ``pred.tsv`` into ``pred.tsv2`` with complemented scores.

    Lines with fewer than two tab-separated fields are dropped. A line whose
    first field parses to ``1`` is copied as-is; any other line is replaced
    by ``0<TAB>(1 - score)``, where ``score`` is its second field.
    """
    src = "C:\\Workspace\\Data\\wenda\\filtered\\bdv2\\odmodel_newdata\\pred.tsv"
    rows = util.readlines_from_file(src)
    with open(src + "2", "w") as out:
        for row in rows:
            fields = row.split("\t")
            if len(fields) < 2:
                continue
            if int(fields[0]) == 1:
                out.write(row + "\n")
                continue
            complement = 1 - float(fields[1])
            out.write("0\t" + str(complement) + "\n")
def gen_questions():
    """Extract the distinct second-column entries of ``wenda_q2q_v2.dat``.

    Only lines with exactly two tab-separated fields are considered. The
    second field is kept when its length lies strictly between
    ``wenda_proc.MIN_COUNT_Q`` and ``wenda_proc.MAX_COUNT_Q`` and it has not
    been written before. Kept entries go to ``wenda_questions.txt``, one per
    line, in first-seen order.
    """
    rows = util.readlines_from_file(DATA_ROOT + "/qclick/wenda_q2q_v2.dat")
    seen = set()
    with open(DATA_ROOT + "/qclick/wenda_questions.txt", "w") as out:
        for row in rows:
            fields = row.split("\t")
            if len(fields) != 2:
                continue
            question = fields[1]
            if not (wenda_proc.MIN_COUNT_Q < len(question) < wenda_proc.MAX_COUNT_Q):
                continue
            if question in seen:
                continue
            seen.add(question)
            out.write(question + "\n")