def main(word_file, pos_file, ngram_file, ngram_word_file, ngram_ai=None):
    """Turn n-gram lines read from stdin into n-gram and n-gram-word rows.

    Every input line must consist of exactly two tab-separated fields: a
    whitespace-separated token list and a frequency.  Tokens may carry a
    part-of-speech suffix after their last underscore (``word_POS``).

    Args:
        word_file: Handed unchanged to ``word_id()`` for every token.
        pos_file: Handed to ``init_pos_key()`` to build the POS lookup and
            closed immediately afterwards.
        ngram_file: Receives one row per n-gram: id, token count, frequency.
        ngram_word_file: Receives one row per token: n-gram id, position,
            word id and POS id (or the NULL marker when no valid tag exists).
        ngram_ai: Last n-gram id already in use; the first n-gram read gets
            ``ngram_ai + 1``.  When ``None`` the value comes from
            ``ngram.max_id('ngram')``.

    Lines that cannot be processed are reported on stderr and echoed to
    stdout; processing then continues with the next line.
    """
    if ngram_ai is None:
        ngram_id = ngram.max_id('ngram')
    else:
        ngram_id = ngram_ai

    pos_key = init_pos_key(pos_file)
    pos_file.close()

    for line in sys.stdin:
        try:
            text, freq = line.rstrip().split('\t')
            ngram_id += 1
            tokens = text.split()
            print('%d\t%d\t%s' % (ngram_id, len(tokens), freq),
                  file=ngram_file)
            for position, token in enumerate(tokens):
                try:
                    stem, pos = token.rsplit('_', 1)
                    pid = pos_key[pos]
                    wid = word_id(stem, word_file)
                except Exception:
                    # No POS tag or invalid POS tag: store the whole token as
                    # the word.  `Exception` (rather than a bare except) keeps
                    # the best-effort fallback but lets KeyboardInterrupt and
                    # SystemExit stop the import.
                    wid = word_id(token, word_file)
                    # NOTE(review): presumably the bulk-loader NULL marker.
                    pid = '\\N'
                print('%d\t%d\t%d\t%s' % (ngram_id, position, wid, pid),
                      file=ngram_word_file)
            if ngram_id % PROGRESS == 0:
                # Periodic progress report.
                print(ngram_id, file=sys.stderr)
        except Exception as e:
            # Report the problem and echo the offending line, then carry on.
            print(e, file=sys.stderr)
            print(line)
def main(word_file, pos_file, ngram_file, ngram_word_file, ngram_ai=None):
    """Turn n-gram lines read from stdin into n-gram and n-gram-word rows.

    Every input line must consist of exactly two tab-separated fields: a
    whitespace-separated token list and a frequency.  Tokens may carry a
    part-of-speech suffix after their last underscore (``word_POS``).

    Output is produced with plain ``write()`` calls and exceptions are bound
    with ``except ... as`` so the function runs on Python 2.6+ and Python 3
    alike; the emitted text is the same as the former print statements.

    Args:
        word_file: Handed unchanged to ``word_id()`` for every token.
        pos_file: Handed to ``init_pos_key()`` to build the POS lookup and
            closed immediately afterwards.
        ngram_file: Receives one row per n-gram: id, token count, frequency.
        ngram_word_file: Receives one row per token: n-gram id, position,
            word id and POS id (or the NULL marker when no valid tag exists).
        ngram_ai: Last n-gram id already in use; the first n-gram read gets
            ``ngram_ai + 1``.  When ``None`` the value comes from
            ``ngram.max_id('ngram')``.

    Lines that cannot be processed are reported on stderr and echoed to
    stdout; processing then continues with the next line.
    """
    if ngram_ai is None:
        ngram_id = ngram.max_id('ngram')
    else:
        ngram_id = ngram_ai

    pos_key = init_pos_key(pos_file)
    pos_file.close()

    for line in sys.stdin:
        try:
            text, freq = line.rstrip().split('\t')
            ngram_id += 1
            tokens = text.split()
            ngram_file.write(
                '%d\t%d\t%s\n' % (ngram_id, len(tokens), freq))
            for position, token in enumerate(tokens):
                try:
                    stem, pos = token.rsplit('_', 1)
                    pid = pos_key[pos]
                    wid = word_id(stem, word_file)
                except Exception:
                    # No POS tag or invalid POS tag: store the whole token as
                    # the word.  `Exception` (rather than a bare except) keeps
                    # the best-effort fallback but lets KeyboardInterrupt and
                    # SystemExit stop the import.
                    wid = word_id(token, word_file)
                    # NOTE(review): presumably the bulk-loader NULL marker.
                    pid = '\\N'
                ngram_word_file.write(
                    '%d\t%d\t%d\t%s\n' % (ngram_id, position, wid, pid))
            if ngram_id % PROGRESS == 0:
                # Periodic progress report.
                sys.stderr.write('%s\n' % (ngram_id,))
        except Exception as e:
            # Report the problem and echo the offending line, then carry on.
            sys.stderr.write('%s\n' % (e,))
            sys.stdout.write(line + '\n')
def main(word_file, pos_file, dep_file, arc_file, arc_word_file, n):
    """Turn syntactic-arc lines read from stdin into arc and arc-word rows.

    Each input line is split on tabs into at most four fields.  Field 1 is a
    whitespace-separated token list and field 2 is the frequency; field 0 and
    the remainder (field 3) are not used.  Every token is decoded with
    ``ngram.parse_word()`` into word, POS, dependency label and head index.

    Args:
        word_file: Handed unchanged to ``word_id()``.
        pos_file: Handed unchanged to ``pos_id()``.
        dep_file: Handed unchanged to ``dep_id()``.
        arc_file: Receives one row per arc: id, ``n``, frequency.
        arc_word_file: Receives one row per token: arc id, position, word
            id, POS id, dependency id, head index.
        n: Arc size; written verbatim as the second column of each arc row.

    Ids start at ``ngram.max_id('arc') + 1``.  A line that cannot be
    processed is reported on stderr, echoed to stdout and contributes no
    rows at all.
    """
    print("Processing %d-arcs" % n, file=sys.stderr)
    arc_id = ngram.max_id('arc') + 1

    for line in sys.stdin:
        try:
            entry = line.strip().split('\t', 3)
            freq = entry[2]
            arc_row = '%d\t%d\t%s' % (arc_id, n, freq)

            # Build every row for this line before writing anything.  A
            # failing line keeps its arc_id for the next line, so rows
            # written before the failure would otherwise be duplicated
            # under the same id.
            word_rows = []
            for position, token in enumerate(entry[1].split()):
                word, pos, dep, head_index = ngram.parse_word(token)
                wid = word_id(word, word_file)
                pid = pos_id(pos, pos_file)
                did = dep_id(dep, dep_file)
                word_rows.append(
                    '%d\t%d\t%d\t%d\t%d\t%s'
                    % (arc_id, position, wid, pid, did, head_index))

            print(arc_row, file=arc_file)
            for row in word_rows:
                print(row, file=arc_word_file)

            # Per-year counts are currently not imported (disabled code):
            # if not skip_years:
            #     for field in entry[3].split('\t'):
            #         year, count = field.split(',')
            #         print('%d\t%s\t%s' % (arc_id, year, count),
            #               file=arc_freq_file)
        except Exception as e:
            # Report the problem and echo the offending line; arc_id is
            # deliberately left unchanged so ids stay contiguous.
            print(e, file=sys.stderr)
            print(line)
        else:
            arc_id += 1
            if arc_id % PROGRESS == 0:
                # Periodic progress report.
                print(arc_id, file=sys.stderr)
def main(word_file, pos_file, dep_file, arc_file, arc_word_file, n):
    """Turn syntactic-arc lines read from stdin into arc and arc-word rows.

    Each input line is split on tabs into at most four fields.  Field 1 is a
    whitespace-separated token list and field 2 is the frequency; field 0 and
    the remainder (field 3) are not used.  Every token is decoded with
    ``ngram.parse_word()`` into word, POS, dependency label and head index.

    Output is produced with plain ``write()`` calls and exceptions are bound
    with ``except ... as`` so the function runs on Python 2.6+ and Python 3
    alike; the emitted text is the same as the former print statements.

    Args:
        word_file: Handed unchanged to ``word_id()``.
        pos_file: Handed unchanged to ``pos_id()``.
        dep_file: Handed unchanged to ``dep_id()``.
        arc_file: Receives one row per arc: id, ``n``, frequency.
        arc_word_file: Receives one row per token: arc id, position, word
            id, POS id, dependency id, head index.
        n: Arc size; written verbatim as the second column of each arc row.

    Ids start at ``ngram.max_id('arc') + 1``.  A line that cannot be
    processed is reported on stderr, echoed to stdout and contributes no
    rows at all.
    """
    sys.stderr.write("Processing %d-arcs\n" % n)
    arc_id = ngram.max_id('arc') + 1

    for line in sys.stdin:
        try:
            entry = line.strip().split('\t', 3)
            freq = entry[2]
            arc_row = '%d\t%d\t%s\n' % (arc_id, n, freq)

            # Build every row for this line before writing anything.  A
            # failing line keeps its arc_id for the next line, so rows
            # written before the failure would otherwise be duplicated
            # under the same id.
            word_rows = []
            for position, token in enumerate(entry[1].split()):
                word, pos, dep, head_index = ngram.parse_word(token)
                wid = word_id(word, word_file)
                pid = pos_id(pos, pos_file)
                did = dep_id(dep, dep_file)
                word_rows.append(
                    '%d\t%d\t%d\t%d\t%d\t%s\n'
                    % (arc_id, position, wid, pid, did, head_index))

            arc_file.write(arc_row)
            for row in word_rows:
                arc_word_file.write(row)

            # Per-year counts are currently not imported (disabled code):
            # if not skip_years:
            #     for field in entry[3].split('\t'):
            #         year, count = field.split(',')
            #         arc_freq_file.write(
            #             '%d\t%s\t%s\n' % (arc_id, year, count))
        except Exception as e:
            # Report the problem and echo the offending line; arc_id is
            # deliberately left unchanged so ids stay contiguous.
            sys.stderr.write('%s\n' % (e,))
            sys.stdout.write(line + '\n')
        else:
            arc_id += 1
            if arc_id % PROGRESS == 0:
                # Periodic progress report.
                sys.stderr.write('%s\n' % (arc_id,))
def main(word_file, pos_file, ngram_file, ngram_word_file, ngram_freq_file):
    """Aggregate per-line n-gram counts from stdin into import rows.

    Input lines are tab-separated with the n-gram text in field 0 and an
    integer count in field 2.  Consecutive lines sharing the same field 0 are
    treated as one n-gram whose counts are summed.  Tokens of the form
    ``word_POS`` (exactly one underscore) are split into word and POS tag.

    Args:
        word_file: Handed unchanged to ``word_id()``.
        pos_file: Handed unchanged to ``pos_id()``.
        ngram_file: Receives one row per n-gram: id, token count, summed
            frequency.  The row is written once the next n-gram starts, or
            at end of input for the last one.
        ngram_word_file: Receives one row per token: n-gram id, position,
            word id and POS id (or the string NULL when no tag was resolved).
        ngram_freq_file: Unused; the per-year output is disabled.

    Ids continue from ``ngram.max_id('ngram')``.  Lines that cannot be
    processed are reported on stderr and echoed to stdout.
    """
    ngram_id = ngram.max_id('ngram')
    # The cursor and connection held by the ngram module are closed as soon
    # as the starting id has been read.
    ngram.cur.close()
    ngram.db.close()

    cur_ngram = None
    words = []
    total_freq = 0

    for line in sys.stdin:
        try:
            entry = line.rstrip().split('\t')
            if entry[0] != cur_ngram:
                # A new n-gram begins: flush the previous one first.
                if cur_ngram is not None:
                    print('%d\t%d\t%d' % (ngram_id, len(words), total_freq),
                          file=ngram_file)
                cur_ngram = entry[0]
                ngram_id += 1
                total_freq = 0
                words = entry[0].split()
                if ngram_id % PROGRESS == 0:
                    # Periodic progress report.
                    print(ngram_id, file=sys.stderr)
                for position, token in enumerate(words):
                    word = token
                    try:
                        word, pos = token.split('_')
                        pid = pos_id(pos, pos_file)
                    except Exception:
                        # Untagged token or unresolved tag.  `Exception`
                        # (rather than a bare except) lets KeyboardInterrupt
                        # and SystemExit stop the import.
                        pid = 'NULL'
                    # `word` is the bare word when the split succeeded and
                    # the whole token otherwise.
                    wid = word_id(word, word_file)
                    print('%d\t%d\t%d\t%s' % (ngram_id, position, wid, pid),
                          file=ngram_word_file)
            # Only the count is used; entry[1] (year) and entry[3] (vol) fed
            # the per-year output that is currently disabled.
            total_freq += int(entry[2])
        except Exception as e:
            # Report the problem and echo the offending line, then carry on.
            print(e, file=sys.stderr)
            print(line)

    # The last n-gram has no successor to trigger its flush.  Nothing is
    # written when stdin was empty (this used to raise NameError).
    if cur_ngram is not None:
        print('%d\t%d\t%d' % (ngram_id, len(words), total_freq),
              file=ngram_file)
def main(word_file, pos_file, ngram_file, ngram_word_file, ngram_freq_file):
    """Aggregate per-line n-gram counts from stdin into import rows.

    Input lines are tab-separated with the n-gram text in field 0 and an
    integer count in field 2.  Consecutive lines sharing the same field 0 are
    treated as one n-gram whose counts are summed.  Tokens of the form
    ``word_POS`` (exactly one underscore) are split into word and POS tag.

    Output is produced with plain ``write()`` calls and exceptions are bound
    with ``except ... as`` so the function runs on Python 2.6+ and Python 3
    alike; the emitted text is the same as the former print statements.

    Args:
        word_file: Handed unchanged to ``word_id()``.
        pos_file: Handed unchanged to ``pos_id()``.
        ngram_file: Receives one row per n-gram: id, token count, summed
            frequency.  The row is written once the next n-gram starts, or
            at end of input for the last one.
        ngram_word_file: Receives one row per token: n-gram id, position,
            word id and POS id (or the string NULL when no tag was resolved).
        ngram_freq_file: Unused; the per-year output is disabled.

    Ids continue from ``ngram.max_id('ngram')``.  Lines that cannot be
    processed are reported on stderr and echoed to stdout.
    """
    ngram_id = ngram.max_id('ngram')
    # The cursor and connection held by the ngram module are closed as soon
    # as the starting id has been read.
    ngram.cur.close()
    ngram.db.close()

    cur_ngram = None
    words = []
    total_freq = 0

    for line in sys.stdin:
        try:
            entry = line.rstrip().split('\t')
            if entry[0] != cur_ngram:
                # A new n-gram begins: flush the previous one first.
                if cur_ngram is not None:
                    ngram_file.write(
                        '%d\t%d\t%d\n' % (ngram_id, len(words), total_freq))
                cur_ngram = entry[0]
                ngram_id += 1
                total_freq = 0
                words = entry[0].split()
                if ngram_id % PROGRESS == 0:
                    # Periodic progress report.
                    sys.stderr.write('%s\n' % (ngram_id,))
                for position, token in enumerate(words):
                    word = token
                    try:
                        word, pos = token.split('_')
                        pid = pos_id(pos, pos_file)
                    except Exception:
                        # Untagged token or unresolved tag.  `Exception`
                        # (rather than a bare except) lets KeyboardInterrupt
                        # and SystemExit stop the import.
                        pid = 'NULL'
                    # `word` is the bare word when the split succeeded and
                    # the whole token otherwise.
                    wid = word_id(word, word_file)
                    ngram_word_file.write(
                        '%d\t%d\t%d\t%s\n' % (ngram_id, position, wid, pid))
            # Only the count is used; entry[1] (year) and entry[3] (vol) fed
            # the per-year output that is currently disabled.
            total_freq += int(entry[2])
        except Exception as e:
            # Report the problem and echo the offending line, then carry on.
            sys.stderr.write('%s\n' % (e,))
            sys.stdout.write(line + '\n')

    # The last n-gram has no successor to trigger its flush, so write its
    # summary row here; without this the final n-gram never reaches
    # ngram_file.  Nothing is written when stdin was empty.
    if cur_ngram is not None:
        ngram_file.write(
            '%d\t%d\t%d\n' % (ngram_id, len(words), total_freq))