コード例 #1
0
def main(data_type="ebola_stem", argv=sys.argv):
    """Build the path templates for ``data_type`` and run ``deal_thread``.

    Args:
        data_type: Dataset name used to build the input/output path
            templates. Names starting with ``"ny"`` use zero-padded
            seven-digit input file names and ``du.ny_file_count``; any other
            name uses plain ``{}`` file names and ``du.ebola_file_count``.
        argv: Argument list handed unchanged to ``mp.multi_main``.
    """
    root = "../../datas/"
    out_dir = root + "LangModel/tmp/" + data_type + "_{}.m.json"

    if data_type.startswith("ny"):
        in_dir = root + data_type + "_json/{:07d}.json"
        file_count = du.ny_file_count
    else:
        in_dir = root + data_type + "_json/{}.json"
        file_count = du.ebola_file_count

    # Both the test callable and the real run receive the same templates.
    templates = {"in_dir": in_dir, "out_dir": out_dir}

    # Test callable: worker 0 of 1, limited to a file_count of 200.
    test = mp.partial(deal_thread, 0, 1, file_count=200, **templates)

    mp.multi_main(target=deal_thread,
                  test_target=test,
                  use_pool=True,
                  argv=argv,
                  file_count=file_count,
                  **templates)
    logging.info("[#] multi_merge all done")
コード例 #2
0
ファイル: use_se.py プロジェクト: hikean/TREC2017-DD-ICTNET
def main(iter_count, return_count, likehood, in_dir, out_dir, dtype, se_name):
    """Run ``thread_main`` through ``mp.multi_main`` and optionally re-judge.

    Every parameter is passed on by keyword, both to the single-process test
    callable (with ``test=True``) and to ``mp.multi_main`` (with
    ``test=False``). When the word ``"process"`` appears in ``sys.argv``, the
    last command-line argument is read as the process count and ``rejudege``
    is called afterwards.
    """
    # Keyword arguments common to the test callable and the real run.
    shared = dict(in_dir=in_dir,
                  out_dir=out_dir,
                  dtype=dtype,
                  se_name=se_name,
                  iter_count=iter_count,
                  return_count=return_count,
                  likehood=likehood)

    single_process_test = mp.partial(thread_main,
                                     process_id=0,
                                     process_count=1,
                                     test=True,
                                     **shared)
    mp.multi_main(target=thread_main,
                  test_target=single_process_test,
                  use_pool=True,
                  test=False,
                  **shared)

    if "process" in sys.argv:
        # The process count is expected as the last command-line argument.
        rejudege(int(sys.argv[-1]),
                 iter_counts=[1, 2, 3, 5, 10],
                 max_iter_count=iter_count,
                 out_dir=out_dir,
                 dtype=dtype)

    print("\nDone!")
コード例 #3
0
def main():
    """Run ``merge_thread`` over the ebola files via ``mp.multi_main``."""
    in_dir = '/home/zhangwm/trec/datas/ebola_full/{}.json'
    out_dir = '/home/zhangwm/trec/datas/merged_fields_ebola/{}.json'
    dtype = "ebola"
    file_count = du.ebola_file_count

    # Test callable: worker 0 of 1 with positional arguments
    # (in_dir, out_dir, dtype, 200, True).
    trial = mp.partial(merge_thread, 0, 1, in_dir, out_dir, dtype, 200, True)

    run_kwargs = {
        "in_dir": in_dir,
        "out_dir": out_dir,
        "dtype": dtype,
        "file_count": file_count,
        "overwrite": True,
    }
    mp.multi_main(target=merge_thread,
                  test_target=trial,
                  use_pool=True,
                  **run_kwargs)
コード例 #4
0
def main():
    """Dispatch on the command line: merge results or run ``filter_thread``.

    If the first command-line argument contains ``"merge"``, it is used as the
    options string and the last argument (removed from ``sys.argv``) is parsed
    as the process count for ``merge_results``. Otherwise the last argument is
    removed from ``sys.argv`` and used as the options string for
    ``filter_thread``, and the shortened argument list is handed to
    ``mp.multi_main``.
    """
    logging.root.setLevel(logging.WARNING)
    # Same list object as sys.argv: the pop() calls below shorten sys.argv.
    argsv = sys.argv

    if "merge" in argsv[1]:
        merge_options = argsv[1]
        worker_total = int(argsv.pop())
        merge_results(merge_options, worker_total)
        return

    options = argsv.pop()
    # The test callable gets the options string with a "test" suffix.
    trial = mp.partial(filter_thread, 0, 1, options + "test")
    mp.multi_main(target=filter_thread,
                  test_target=trial,
                  argv=argsv,
                  options=options,
                  use_pool=True)
コード例 #5
0
ファイル: eb_full.py プロジェクト: hikean/TREC2017-DD-ICTNET
def main(argv=sys.argv):
    """Run ``deal_thread`` from ``ebola`` to ``ebola_full`` via ``mp.multi_main``.

    Args:
        argv: Argument list handed unchanged to ``mp.multi_main``.
    """
    logging.root.setLevel(logging.INFO)

    templates = {
        "in_dir": "../../datas/ebola/{}.json",
        "out_dir": "../../datas/ebola_full/{}.json",
    }

    # Test callable: worker 0 of 1 with the two templates passed positionally.
    trial = mp.partial(deal_thread, 0, 1,
                       templates["in_dir"], templates["out_dir"])

    mp.multi_main(target=deal_thread,
                  test_target=trial,
                  use_pool=True,
                  argv=argv,
                  file_count=194481,
                  **templates)
コード例 #6
0
def main():
    """Run ``deal_thread`` over the ny_json files via ``mp.multi_main``.

    The same five path templates are given to the test callable and to the
    real run; the test callable additionally gets ``file_count=200``.
    """
    templates = {
        "in_dir": "../../datas/ny_json/{:07d}.json",
        "out_dir": "../../datas/ny_words/{:07d}.txt",
        "json_dir": "../../datas/ny_words_json/{:07d}.json",
        "stem_dir": "../../datas/ny_stem/{:07d}.json",
        "stem_jsdir": "../../datas/ny_stem_json/{:07d}.json",
    }

    # Test callable: worker 0 of 1.
    trial = mp.partial(deal_thread, 0, 1, file_count=200, **templates)

    mp.multi_main(target=deal_thread, test_target=trial, **templates)
コード例 #7
0
def main():
    """Run ``deal_thread`` over the ebola files via ``mp.multi_main``.

    The same five path templates are given to the test callable and to the
    real run; the test callable additionally gets ``file_count=200``.
    """
    out_base_dir = "../../datas/"
    templates = {
        "in_dir": "../../datas/ebola/{}.json",
        "out_dir": out_base_dir + "ebola_words/{}.txt",
        "json_dir": out_base_dir + "ebola_words_json/{}.json",
        "stem_dir": out_base_dir + "ebola_stem/{}.txt",
        "stem_jsdir": out_base_dir + "ebola_stem_json/{}.json",
    }

    # Test callable: worker 0 of 1.
    trial = mp.partial(deal_thread, 0, 1, file_count=200, **templates)

    mp.multi_main(target=deal_thread, test_target=trial, **templates)
コード例 #8
0

def index_ebola(es, file_id):
    """Index one file into the ``trec`` index under doc type ``nytimes``.

    Args:
        es: Client object exposing an ``index`` method (an ``Elasticsearch``
            instance in ``index_thread``).
        file_id: Value used both to fill the module-level ``FILE_TEMPLATE``
            and as the document id.
    """
    path = FILE_TEMPLATE.format(file_id)
    es.index(id=file_id,
             index="trec",
             doc_type="nytimes",
             body=deal_json(path))


def index_thread(thread_id, thread_count, file_count=1855658):
    """Index every ``thread_count``-th file, starting at ``thread_id``.

    Args:
        thread_id: First file id handled by this worker.
        thread_count: Step between consecutive file ids of this worker.
        file_count: Highest file id to index (inclusive).

    Any exception raised while indexing a file is logged and the worker moves
    on to its next file id.
    """
    client = Elasticsearch(ELASTICS_HOSTS)
    current = thread_id
    while current <= file_count:
        try:
            index_ebola(client, current)
            # Occasional progress message.
            if current % 101 == thread_id:
                logging.warning("[#] processing file %s.json", current)
        except Exception as err:
            logging.exception("[!] index nytimes exception: %s", err)
        current += thread_count


if __name__ == "__main__":
    # Test callable: worker 0 of 1 with file_count=281.
    trial = mp.partial(index_thread,
                       thread_id=0,
                       thread_count=1,
                       file_count=281)
    mp.multi_main(target=index_thread, test_target=trial)
コード例 #9
0
    if exists(out_file_name):
        return
    js = json.load(codecs.open(in_file_name, "r"))
    js["content"] = parse_html(js["content"])
    js["url"] = unquote(js["url"])
    with codecs.open(out_file_name, "w", "utf-8") as fl:
        fl.write(json.dumps(js))


def ebola_thread(thread_id, thread_count, in_dir, out_dir, file_count=194481):
    """Process one contiguous slice of file ids with ``ebola``.

    The slice starts at ``thread_id * file_count // thread_count`` and spans
    ``file_count // thread_count + 2`` consecutive ids.

    Args:
        thread_id: Index of this worker.
        thread_count: Total number of workers.
        in_dir: Input path template, filled with the file id.
        out_dir: Output path template, filled with the file id.
        file_count: Total number of files shared between the workers.

    An exception raised for a file is printed together with the file id and
    the loop continues with the next id.
    """
    first_id = thread_id * file_count // thread_count
    span = file_count // thread_count + 2
    for file_id in range(first_id, first_id + span):
        try:
            ebola(in_dir.format(file_id), out_dir.format(file_id))
        except Exception as err:
            print("%s %s" % (err, file_id))


if __name__ == "__main__":
    # Path templates; ebola_thread fills "{}" with the file id.
    in_dir = "../datas/ebola/{}.json"
    out_dir = "../datas/ebola_clean/{}.json"

    # Test callable: worker 0 of 1 with file_count=1000.
    trial = mp.partial(ebola_thread, 0, 1, in_dir, out_dir, file_count=1000)

    mp.multi_main(target=ebola_thread,
                  test_target=trial,
                  in_dir=in_dir,
                  out_dir=out_dir)
コード例 #10
0
    file_id = thread_id
    while file_id < file_count:
        try:
            js = json.load(codecs.open(in_dir.format(file_id), "r", "utf-8"))
        except Exception as e:
            logging.exception(
                "[!] <{}> <{}>: {}".format(thread_id, file_id, e)
            )

        words = deal_news(js)
        out_file.write(str(file_id))
        out_file.write(" ")
        out_file.write(",".join(words))
        out_file.write("\n")
        file_id += thread_count
    out_file.close()


if __name__ == "__main__":
    in_dir = "../datas/ny_json/{:07d}.json"
    out_dir = "../datas/ny_words/ny_words_{}.txt"

    # Test callable: worker 0 of 1 with file_count=200.
    trial = mp.partial(deal_thread,
                       0,
                       1,
                       in_dir=in_dir,
                       out_dir=out_dir,
                       file_count=200)

    mp.multi_main(target=deal_thread,
                  test_target=trial,
                  in_dir=in_dir,
                  out_dir=out_dir,
                  file_count=1855658)
コード例 #11
0
def deal_thread(thread_id,
                thread_count,
                file_tmplt="../datas/ny_json/{:07d}.json",
                nonested_file_tmplt="../datas/nonested/{:07d}.json",
                merged_file_tmplt="../datas/merged/{:07d}.json",
                file_count=1855658):
    """Call ``deal_files`` for every ``thread_count``-th file id.

    Args:
        thread_id: First file id handled by this worker.
        thread_count: Step between consecutive file ids of this worker.
        file_tmplt: Input path template, filled with the file id.
        nonested_file_tmplt: Path template for the second ``deal_files``
            argument, or ``None`` to pass ``None`` instead of a path.
        merged_file_tmplt: Path template for the third ``deal_files``
            argument, or ``None`` to pass ``None`` instead of a path.
        file_count: Exclusive upper bound for the file ids.

    An exception raised by ``deal_files`` is logged and the worker continues
    with its next file id.
    """
    def _fill(template, file_id):
        # An absent template stays None; otherwise fill in the file id.
        if template is None:
            return None
        return template.format(file_id)

    current = thread_id
    while current < file_count:
        source = file_tmplt.format(current)
        nonested_path = _fill(nonested_file_tmplt, current)
        merged_path = _fill(merged_file_tmplt, current)
        try:
            deal_files(source, nonested_path, merged_path)
        except Exception as err:
            logging.exception("deal thread %s", err)
        current += thread_count


if __name__ == "__main__":
    # Test callable: worker 0 of 1 with file_count=200 and default templates.
    trial = mp.partial(deal_thread,
                       thread_id=0,
                       thread_count=1,
                       file_count=200)
    mp.multi_main(target=deal_thread, test_target=trial)
コード例 #12
0
def ebola(in_file_name, out_file_name):
    """Clean one JSON document and write the result to ``out_file_name``.

    The document is loaded from ``in_file_name``, its ``"content"`` value is
    replaced by ``parse_html(content)`` and its ``"url"`` value by
    ``unquote(url)``. The updated document is then serialized as JSON into
    ``out_file_name`` (opened with UTF-8 encoding).

    Nothing is done when ``out_file_name`` already exists.

    Args:
        in_file_name: Path of the JSON file to read.
        out_file_name: Path of the JSON file to create.
    """
    if exists(out_file_name):
        return

    # Read inside a ``with`` block so the input handle is closed right away
    # rather than being left open until garbage collection.
    with codecs.open(in_file_name, "r") as in_file:
        js = json.load(in_file)

    js["content"] = parse_html(js["content"])
    js["url"] = unquote(js["url"])

    with codecs.open(out_file_name, "w", "utf-8") as fl:
        fl.write(json.dumps(js))


def ebola_thread(thread_id, thread_count, in_dir, out_dir, file_count=194481):
    """Run ``ebola`` on every ``thread_count``-th file id.

    Args:
        thread_id: First file id handled by this worker.
        thread_count: Step between consecutive file ids of this worker.
        in_dir: Input path template, filled with the file id.
        out_dir: Output path template, filled with the file id.
        file_count: Exclusive upper bound for the file ids.
    """
    current = thread_id
    while current < file_count:
        source = in_dir.format(current)
        target = out_dir.format(current)
        ebola(source, target)
        current += thread_count


if __name__ == "__main__":
    # Path templates; ebola_thread fills "{}" with the file id.
    in_dir = "../datas/ebola/{}.json"
    out_dir = "../datas/ebola_json/{}.json"

    # Test callable: worker 0 of 1 with file_count=200.
    trial = mp.partial(ebola_thread,
                       thread_id=0,
                       thread_count=1,
                       in_dir=in_dir,
                       out_dir=out_dir,
                       file_count=200)

    mp.multi_main(target=ebola_thread,
                  test_target=trial,
                  in_dir=in_dir,
                  out_dir=out_dir)