# Build train/test model inputs from a comma-separated corpus.
def main(argc, argv):
    if argc < 6:
        print('Usage:%s <data> <char_vob> <train_output> <test_output> <output_type>'
              % (argv[0]))
        exit(1)
    char_vob = w2v.Word2vecVocab()
    char_vob.Load(argv[2])
    word_vob = w2v.Word2vecVocab()
    word_vob.Load('words_vec_100.txt')
    train_out = open(argv[3], 'w')
    test_out = open(argv[4], 'w')
    with open(argv[1], 'r') as f:
        csv_reader = csv.reader(f, delimiter=',')
        data = [row for row in csv_reader]
    stat_max_len(data)
    train_data, test_data = build_dataset(data)
    processLine(train_out, argv[5], train_data, char_vob, word_vob)
    processLine(test_out, argv[5], test_data, char_vob, word_vob)
    train_out.close()
    test_out.close()

# Walk a directory tree of .txt files and emit per-line features.
def main(argc, argv):
    global totalLine
    global longLine
    global totalChars
    if argc < 6:
        print("Usage:%s <word_vob> <char_vob> <pos_vob> <dir> <output>" % (argv[0]))
        sys.exit(1)
    wvobPath = argv[1]
    cvobPath = argv[2]
    pvobPath = argv[3]
    rootDir = argv[4]
    word_vob = w2v.Word2vecVocab()
    word_vob.Load(wvobPath)
    char_vob = w2v.Word2vecVocab()
    char_vob.Load(cvobPath)
    posVob = {}
    loadPosVob(pvobPath, posVob)
    out = open(argv[5], "w")
    for dirName, subdirList, fileList in os.walk(rootDir):
        for file in fileList:
            if file.endswith(".txt"):
                # os.walk already yields dirName prefixed with rootDir,
                # so joining rootDir in again would double the prefix.
                curFile = os.path.join(dirName, file)
                # print("processing:%s" % (curFile))
                fp = open(curFile, "r")
                for line in fp.readlines():
                    line = line.strip()
                    processLine(line, out, word_vob, char_vob, posVob)
                fp.close()
    out.close()
    print("total:%d, long lines:%d, chars:%d" % (totalLine, longLine, totalChars))

# Vectorize a single corpus file line by line, then split into train/test sets.
def main(argc, argv):
    global totalLine
    global longLine
    global totalChars
    if argc < 5:
        print("Usage:%s <vec_vob> <tag_vob> <corpus> <output>" % (argv[0]))
        sys.exit(1)
    # wvobPath = argv[1]
    cvobPath = argv[1]
    pvobPath = argv[2]
    corpusPath = argv[3]
    vec_vob = w2v.Word2vecVocab()
    vec_vob.Load(cvobPath)
    tagVob = {}
    loadtagVob(pvobPath, tagVob)
    out = open(argv[4], "w")
    with open(corpusPath, 'r') as fp:
        all_text = fp.readlines()
    file_len = len(all_text)
    for count, line in enumerate(all_text):
        line = line.strip()
        if count % 1000 == 0:
            print(count, file_len)
        processLine(line, out, vec_vob, tagVob)
    out.close()
    print("total:%d, long lines:%d, chars:%d" % (totalLine, longLine, totalChars))
    split_train_testing(argv[4])

def main(argc, argv):
    if argc < 3:
        print("Usage:%s <word2vec_vocab_path> <output_path>" % (argv[0]))
        sys.exit(1)
    vob = w2v.Word2vecVocab()
    vob.Load(argv[1])
    vob.DumpBasicVocab(argv[2])

def main(argc, argv):
    global totalLine
    global longLine
    global totalChars
    if argc < 4:
        print("Usage:%s <vob> <dir> <output>" % (argv[0]))
        sys.exit(1)
    vobPath = argv[1]
    rootDir = argv[2]
    vob = w2v.Word2vecVocab()
    vob.Load(vobPath)
    out = open(argv[3], "w")
    for dirName, subdirList, fileList in os.walk(rootDir):
        for file in fileList:
            if file.endswith(".txt"):
                curFile = os.path.join(dirName, file)
                # print("processing:%s" % (curFile))
                fp = open(curFile, "r")
                for line in fp.readlines():
                    line = line.strip()
                    processLine(line, out, vob)
                fp.close()
    out.close()
    print("total:%d, long lines:%d, chars:%d" % (totalLine, longLine, totalChars))

def main(argc, argv):
    if argc < 4:
        print("Usage: %s <input> <output> <vec>" % (argv[0]))
        sys.exit(1)
    vob = w2v.Word2vecVocab()
    vob.Load(argv[3])
    inp = open(argv[1], "r")
    oup = open(argv[2], "w")
    processFile(inp, oup, vob)

def main(argc, argv):
    if argc < 5:
        print('Usage:%s <data_dir> <char_vob> <output_type> <output_name>' % (argv[0]))
        exit(1)
    char_vob = w2v.Word2vecVocab()
    char_vob.Load(argv[2])
    process_data(argv[1], char_vob, int(argv[3]), argv[4])

def main(argc, argv):
    if argc != 6:
        print('Usage:{} <vec_vob> <char_vob> <tag_vob> <corpus> <output>'.format(argv[0]))
        sys.exit(1)
    wvobPath = argv[1]
    cvobPath = argv[2]
    tagvobPath = argv[3]
    corpusPath = argv[4]
    outputPath = argv[5]
    # Load the word2vec vocabularies.
    word_vob = w2v.Word2vecVocab()
    char_vob = w2v.Word2vecVocab()
    word_vob.Load(wvobPath)
    char_vob.Load(cvobPath)
    tag_vob = loadtagVob(tagvobPath)
    generate_train = Generate_train(char_vob, word_vob, tag_vob)
    processed_lines_gen = generate_train.process(corpusPath)
    print('total line:{}'.format(generate_train.total_line))
    write_train_data(outputPath, processed_lines_gen, generate_train.total_line)

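# Note that this script's loadtagVob returns the mapping, while the variant
# used earlier (loadtagVob(pvobPath, tagVob)) fills a dict in place. A minimal
# sketch of the returning form, assuming a vocabulary file with one tag per
# line and the 0-based line index as the tag id (the file format here is an
# assumption, not taken from the original):
def loadtagVob(path):
    tag_vob = {}
    with open(path, "r") as fp:
        for idx, line in enumerate(fp):
            tag = line.strip()
            if tag:
                tag_vob[tag] = idx
    return tag_vob
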
# Build train/test model inputs from a JSON corpus.
def main(argc, argv):
    if argc < 6:
        print("Usage:%s <data> <word_vob> <char_vob> <train_output> <test_output>"
              % (argv[0]))
        sys.exit(1)
    train_output = open(argv[4], "w")
    test_output = open(argv[5], "w")
    word_vob = w2v.Word2vecVocab()
    word_vob.Load(argv[2])
    char_vob = w2v.Word2vecVocab()
    char_vob.Load(argv[3])
    with open(argv[1], 'r') as f:
        data = json.load(f)
    stat_max_len(data)
    train_data, test_data = build_dataset(data)
    generate_net_input(train_data, train_output, word_vob, char_vob)
    generate_net_input(test_data, test_output, word_vob, char_vob)
    train_output.close()
    test_output.close()

def main(argc, argv):
    if argc < 3:
        print("Usage: %s <input> <output> [mode | 0 for w2v, 1 for training] "
              "[vec_path | required if mode is not 0]" % (argv[0]))
        sys.exit(1)
    mode = 0
    vob = None
    if argc > 4:
        mode = int(argv[3])
        vob = w2v.Word2vecVocab()
        vob.Load(argv[4])
    inp = open(argv[1], "r")
    oup = open(argv[2], "w")
    processFile(inp, oup, mode, vob)

def doGen(inputPath, outputPath, vocabPath):
    global totalLine
    global longLine
    global totalChars
    vob = w2v.Word2vecVocab()
    vob.Load(vocabPath)
    with open(inputPath, "r") as inp:
        with open(outputPath, "w") as out:
            for line in inp.readlines():
                line = line.strip()
                if not line:
                    continue
                processLine(line, vob, out)
    print("total:%d, long lines:%d, chars:%d" % (totalLine, longLine, totalChars))

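# A hypothetical command-line wrapper for doGen, mirroring the argv
# conventions of the other scripts in this collection; the argument order
# shown here is an assumption, not part of the original.
if __name__ == "__main__":
    import sys
    if len(sys.argv) < 4:
        print("Usage:%s <vob> <input> <output>" % (sys.argv[0]))
        sys.exit(1)
    doGen(sys.argv[2], sys.argv[3], sys.argv[1])
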
def convert(trainPath, trainOutPath, testOutPath, vocabPath, titleVobPath,
            partFrom=0, partEnd=9, testRatio=0.02):
    vocab = w2v.Word2vecVocab()
    vocab.Load(vocabPath)
    writerTrain = tf.python_io.TFRecordWriter(trainOutPath)
    writerTest = tf.python_io.TFRecordWriter(testOutPath)
    raseg.init_config("/var/local/seg/conf/qsegconf.ini")
    seg = raseg.ImTokenizer()
    titleVocab = {}
    load_title_dict(titleVobPath, titleVocab)
    numTag = len(titleVocab) + 1
    npos = 0
    nneg = 0
    processed = 0
    for i in range(partFrom, partEnd + 1):
        with open("%s/part-r-%05d" % (trainPath, i), "r") as fp:
            for line in fp.readlines():
                line = line.strip()
                if not line:
                    continue
                processed += 1
                ss = line.split("\t")
                assert len(ss) == 8
                title = ss[0].lower()
                if title == '网络':
                    title = '网络工程师'
                if title not in titleVocab:
                    print("[%s] not there!! " % (title))
                    continue
                target = titleVocab[title]
                target_orgId = int(ss[1])
                gender = int(ss[2])
                age = int(ss[3])
                location = int(ss[4])
                edustrs = ss[5].split(" ")
                assert len(edustrs) == 12
                edu_expr1 = EduExperience(edustrs, 0)
                edu_expr2 = EduExperience(edustrs, 1)
                edu_expr3 = EduExperience(edustrs, 2)
                workstrs = ss[6].split(" ")
                assert len(workstrs) == 18
                work_expr1 = WorkExperience(workstrs, 0)
                workTokens = gen_sentence_features(work_expr1.desc, vocab, seg)
                work_expr2 = WorkExperience(workstrs, 1)
                workTokens += gen_sentence_features(work_expr2.desc, vocab, seg)
                work_expr3 = WorkExperience(workstrs, 2)
                workTokens += gen_sentence_features(work_expr3.desc, vocab, seg)
                projstrs = ss[7].split(" ")
                assert len(projstrs) == 3
                proj1 = ""
                if projstrs[0] != 'None':
                    proj1 = base64.b64decode(projstrs[0])
                proj2 = ""
                if projstrs[1] != 'None':
                    proj2 = base64.b64decode(projstrs[1])
                proj3 = ""
                if projstrs[2] != 'None':
                    proj3 = base64.b64decode(projstrs[2])
                projTokens = gen_sentence_features(proj1, vocab, seg)
                projTokens += gen_sentence_features(proj2, vocab, seg)
                projTokens += gen_sentence_features(proj3, vocab, seg)
                assert len(workTokens) == 3 * MAX_TOKEN_NUM_PER_SENTENCE
                example = tf.train.Example(features=tf.train.Features(feature={
                    "target": tf.train.Feature(
                        int64_list=tf.train.Int64List(value=[target])),
                    "target_orgId": tf.train.Feature(
                        int64_list=tf.train.Int64List(value=[target_orgId])),
                    "gender": tf.train.Feature(
                        int64_list=tf.train.Int64List(value=[gender])),
                    "age": tf.train.Feature(
                        int64_list=tf.train.Int64List(value=[age])),
                    "location": tf.train.Feature(
                        int64_list=tf.train.Int64List(value=[location])),
                    "education_schools": tf.train.Feature(int64_list=tf.train.Int64List(
                        value=[edu_expr1.school, edu_expr2.school, edu_expr3.school])),
                    "education_degrees": tf.train.Feature(int64_list=tf.train.Int64List(
                        value=[edu_expr1.degree, edu_expr2.degree, edu_expr3.degree])),
                    "education_starts": tf.train.Feature(float_list=tf.train.FloatList(
                        value=[edu_expr1.start, edu_expr2.start, edu_expr3.start])),
                    "education_majors": tf.train.Feature(int64_list=tf.train.Int64List(
                        value=[edu_expr1.major, edu_expr2.major, edu_expr3.major])),
                    "work_expr_orgs": tf.train.Feature(int64_list=tf.train.Int64List(
                        value=[work_expr1.org, work_expr2.org, work_expr3.org])),
                    "work_expr_starts": tf.train.Feature(float_list=tf.train.FloatList(
                        value=[work_expr1.start, work_expr2.start, work_expr3.start])),
                    # 'duaration' (sic) matches the attribute name on WorkExperience.
                    "work_expr_durations": tf.train.Feature(float_list=tf.train.FloatList(
                        value=[work_expr1.duaration, work_expr2.duaration,
                               work_expr3.duaration])),
                    "work_expr_jobs": tf.train.Feature(int64_list=tf.train.Int64List(
                        value=[work_expr1.job, work_expr2.job, work_expr3.job])),
                    "work_expr_orgIds": tf.train.Feature(int64_list=tf.train.Int64List(
                        value=[work_expr1.orgId, work_expr2.orgId, work_expr3.orgId])),
                    "work_expr_descs": tf.train.Feature(
                        int64_list=tf.train.Int64List(value=workTokens)),
                    "proj_expr_descs": tf.train.Feature(
                        int64_list=tf.train.Int64List(value=projTokens)),
                }))
                if random.random() <= testRatio:
                    writerTest.write(example.SerializeToString())
                    nneg += 1
                else:
                    writerTrain.write(example.SerializeToString())
                    npos += 1
                if processed % 200 == 0:
                    print("processed %d, neg:%d, pos:%d....." %
                          (processed, nneg, npos))
    print("max len of sentences:%d" % (maxTokens))
    writerTrain.close()
    writerTest.close()
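
# Reading these records back requires mirroring the schema above key for key.
# A minimal parsing sketch, assuming the same TF 1.x API family as the
# tf.python_io writer. MAX_TOKEN_NUM_PER_SENTENCE below is a placeholder and
# must match the value used at write time; the proj_expr_descs length assumes
# gen_sentence_features pads each description to MAX_TOKEN_NUM_PER_SENTENCE
# tokens, as the assert on workTokens implies.
import tensorflow as tf

MAX_TOKEN_NUM_PER_SENTENCE = 50  # placeholder; must match the writer

def parse_resume_example(serialized):
    # Keys and shapes mirror the tf.train.Example built in convert().
    features = {
        "target": tf.FixedLenFeature([1], tf.int64),
        "target_orgId": tf.FixedLenFeature([1], tf.int64),
        "gender": tf.FixedLenFeature([1], tf.int64),
        "age": tf.FixedLenFeature([1], tf.int64),
        "location": tf.FixedLenFeature([1], tf.int64),
        "education_schools": tf.FixedLenFeature([3], tf.int64),
        "education_degrees": tf.FixedLenFeature([3], tf.int64),
        "education_starts": tf.FixedLenFeature([3], tf.float32),
        "education_majors": tf.FixedLenFeature([3], tf.int64),
        "work_expr_orgs": tf.FixedLenFeature([3], tf.int64),
        "work_expr_starts": tf.FixedLenFeature([3], tf.float32),
        "work_expr_durations": tf.FixedLenFeature([3], tf.float32),
        "work_expr_jobs": tf.FixedLenFeature([3], tf.int64),
        "work_expr_orgIds": tf.FixedLenFeature([3], tf.int64),
        "work_expr_descs": tf.FixedLenFeature(
            [3 * MAX_TOKEN_NUM_PER_SENTENCE], tf.int64),
        "proj_expr_descs": tf.FixedLenFeature(
            [3 * MAX_TOKEN_NUM_PER_SENTENCE], tf.int64),
    }
    return tf.parse_single_example(serialized, features)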