def create_table_code():
    """Return source lines binding each reflected table name to ``get_table(...)``.

    One ``<name> = get_table('<name>')`` line per table from ``get_tables()``,
    each terminated by a newline.
    """
    # Collect the snippets first and join once at the end: the original
    # re-joined the whole accumulated string on every iteration (quadratic),
    # and its local `table_code` shadowed the module-level table_code() helper.
    snippets = []
    for table in get_tables():
        snippets.append(
            "{tablename} = get_table('{tablename}')\n".format(tablename=table))
    return joinstr(snippets, "")
def create_code(dbtype):
    """Generate module source for all tables reported by the inspector.

    :param dbtype: key into ``sqldbtypes`` selecting the dialect-specific
        type-import block for the header; unknown keys fall back to "".
    :returns: the formatted ``header`` followed by one code block per table.
    """
    codes = [table_code(table) for table in inspector.get_table_names()]
    code = joinstr(codes, "")
    try:
        _dbtypes = sqldbtypes[dbtype]
    except KeyError:
        # Unknown dialect: emit the header without extra type imports.
        # (Was a bare `except:`, which also hid unrelated errors.)
        _dbtypes = ""
    return header.format(dbtypes=_dbtypes) + code
def table_code(table):
    """Render the declaration for one table: its template filled with the
    per-column code produced by ``create_column_code``."""
    reflected_cols = inspector.get_columns(table)
    rendered = joinstr(
        [create_column_code(col, table) for col in reflected_cols], '\n')
    return table_tpl.format(TableName=table, table=table, columns=rendered)
def create_column_code(col, table):
    """Render a single Column(...) argument list from a reflected column dict.

    :param col: column mapping from ``inspector.get_columns`` — reads
        'name', 'nullable', 'type' and (optionally) 'primary_key'.
    :param table: table name, used to look up foreign keys for this column.
    :returns: ``column_tpl`` formatted with the column name and the
        comma-joined argument string.
    """
    fnkey_map = foreign_keys_map(table)
    name = col['name']
    nullable = col['nullable']
    # Some dialects omit 'primary_key' from the reflected dict; treat it as 0.
    # (Was a bare `except:`, which also hid unrelated errors.)
    primary_key = col.get('primary_key', 0)
    if isinstance(col['type'], sqlalchemy.sql.sqltypes.NullType):
        # NullType has no usable str(); emit the constructor call literally.
        _type = "NullType()"
    else:
        _type = str(col['type'])
    if _type == "TIMESTAMP WITHOUT TIME ZONE":
        _type = "DATETIME()"
    # Types rendered without an argument list still need a call, e.g. INTEGER().
    if not _type.endswith(')'):
        _type += '()'
    if name in fnkey_map:
        ftable, fcolumn = fnkey_map[name]
        _foreign_keys = "ForeignKey(\"%s.%s\")" % (ftable, fcolumn)
    else:
        # Kept as None (not dropped) so joinstr sees the same argument list
        # as before — presumably it skips None entries; verify in utils.
        _foreign_keys = None
    code = [
        '\"%s\"' % name,
        _type,
        _foreign_keys,
        "primary_key=%s" % str(primary_key),
        "nullable=%s" % nullable,
    ]
    _code = joinstr(code, ", ")
    return column_tpl.format(colname=name, data=_code)
def generate_sents(output_name, group='dev'):
    """Write candidate (i, c, o) frames for the ICO acceptor to five sample files.

    For every evidence-bearing sentence of every pmid in *group*, each
    candidate frame line is ``joinstr([pmid, sent_idx, frame_idx, i, c, o, s])``.
    Which file a frame goes to depends on which slots were found in-sentence:
      - sample_none: i, c and o all found in the sentence (nothing borrowed)
      - sample_o / sample_c / sample_ic / sample_co: the named slot(s) were
        missing in-sentence and are filled from document-level NER spans.
    """
    outdir = '{}/ico_acceptor/{}/'.format(utils.DATA_DIR, output_name)
    try:
        os.mkdir(outdir)
    except OSError:
        # Directory already exists — reuse it (files below are overwritten).
        print('Target dir: {} already exists'.format(outdir))
    sent_dir = '{}/documents/sents/'.format(utils.DATA_DIR)
    pmids = utils.group_ids(group)
    # NOTE(review): these five handles are never closed/flushed explicitly —
    # consider a with-block or an ExitStack; confirm callers rely on CPython
    # refcount cleanup before changing.
    sample_c = open('{}/{}_sample_c.txt'.format(outdir, group), 'w')
    sample_o = open('{}/{}_sample_o.txt'.format(outdir, group), 'w')
    sample_ic = open('{}/{}_sample_ic.txt'.format(outdir, group), 'w')
    sample_co = open('{}/{}_sample_co.txt'.format(outdir, group), 'w')
    sample_none = open('{}/{}_sample_none.txt'.format(outdir, group), 'w')
    for pmid in pmids:
        # Document-level NER spans: the candidate pool for slots the
        # current sentence is missing.
        all_ner_f = '{}/documents/txts/{}.ner_test'.format(utils.DATA_DIR, pmid)
        all_ner = json.loads(open(all_ner_f).read())
        # Per-sentence data, aligned by line: text, binary evidence label,
        # and sentence-level NER spans (dict with at least 'i' and 'o' keys).
        sents = utils.readlines('{}/{}.sents'.format(sent_dir, pmid))
        ev_labels = [int(l) for l in utils.readlines('{}/{}.bert_ev_binary'.format(sent_dir, pmid))]
        ners = [json.loads(l) for l in open('{}/{}.ner_test'.format(sent_dir, pmid)).readlines()]
        frame_idx = 0
        for sent_idx, (s, ev, ner) in enumerate(zip(sents, ev_labels, ners)):
            if ev:
                if s == 'ABSTRACT ':
                    # data artifact due to text generation
                    continue
                n_i = len(ner['i'])  # interventions found in this sentence
                n_o = len(ner['o'])  # outcomes found in this sentence
                if n_i >= 2:
                    # Two interventions can form an (i, c) pair directly.
                    i_pairs = combinations(ner['i'], 2)
                    i_pairs = [(i, c) for i, c in i_pairs if i != c]
                    if n_o >= 1:
                        # Everything in-sentence: complete frames.
                        for (i, c), o in product(i_pairs, ner['o']):
                            sample_none.write(utils.joinstr([pmid, sent_idx, frame_idx, i, c, o, s]))
                            sample_none.write('\n')
                            frame_idx += 1
                    else:  # n_o == 0
                        # Outcome missing: borrow o candidates document-wide.
                        o_spans = all_ner['o']
                        for i, c in i_pairs:
                            for o in o_spans:
                                sample_o.write(utils.joinstr([pmid, sent_idx, frame_idx, i, c, o, s]))
                                sample_o.write('\n')
                                frame_idx += 1
                elif n_i == 1:
                    i = ner['i'][0]
                    if n_o >= 1:
                        # Comparator missing: candidates are document-level
                        # interventions. NOTE(review): unlike the n_o == 0
                        # branch below, `i` itself is not excluded here —
                        # confirm whether (i, i, o) pairs are intended.
                        c_spans = all_ner['i']
                        for o in ner['o']:
                            for c in c_spans:
                                sample_c.write(utils.joinstr([pmid, sent_idx, frame_idx, i, c, o, s]))
                                sample_c.write('\n')
                                frame_idx += 1
                    else:  # n_o == 0
                        # Comparator and outcome both missing.
                        c_spans = [c for c in all_ner['i'] if c != i]
                        o_spans = all_ner['o']
                        for c, o in product(c_spans, o_spans):
                            sample_co.write(utils.joinstr([pmid, sent_idx, frame_idx, i, c, o, s]))
                            sample_co.write('\n')
                            frame_idx += 1
                else:  # n_i == 0
                    if n_o >= 1:
                        # Both intervention and comparator missing: draw the
                        # (i, c) pair from document-level interventions.
                        ic_pairs = combinations(all_ner['i'], 2)
                        ic_pairs = [(i, c) for i, c in ic_pairs if i != c]
                        for o in ner['o']:
                            for i, c in ic_pairs:
                                sample_ic.write(utils.joinstr([pmid, sent_idx, frame_idx, i, c, o, s]))
                                sample_ic.write('\n')
                                frame_idx += 1
                    else:  # n_o == 0
                        pass  # too hard! punt!
Compute the label frequency of a given input file and print to a csv format to output file """ from collections import defaultdict from docopt import docopt import logging logging.basicConfig(level=logging.DEBUG) import sys sys.path.append("./common") from utils import joinstr import pandas if __name__ == "__main__": args = docopt(__doc__) input_file = args["--in"] output_file = args["--out"] df = pandas.read_csv(input_file, sep='\t', header=0) labels_dic = dict([(label, df[df.label == label].shape[0]) for label in df.label.unique()]) total = df.label.shape[0] # Print to output file with open(output_file, 'w') as fout: for label, count in sorted(labels_dic.iteritems(), key=lambda (k, v): v, reverse=True): fout.write( joinstr(',', [ label, count, '{:.3f}'.format((float(count) / total) * 100) ]) + '\n')
if all([next_word.label == 'O' for (_, next_word) in words[word.word_id + 1 : ]]): # TODO: this is very inefficient postfix = 'E' else: postfix = word.label.split("-")[0] # Borrow label from last seen tag new_label = word.label # Reform line, only label is possibly changed ret.append([word.word_id, word.word, word.pred, word.pred_id, word.sent_id, word.run_id, new_label]) return ret if __name__ == "__main__": args = docopt(__doc__) input_file = args["--in"] output_file = args["--out"] # iterate over sentences df = pandas.read_csv(input_file, sep = '\t', header = 0) sents = [df[df.run_id == i] for i in range(min(df.run_id), max(df.run_id))] with open(output_file, 'w') as fout: # Write header fout.write(joinstr('\t', [k for k in df.keys()])) #Write sents: for sent in sents: for line in relabel(sent): fout.write(joinstr('\t', line)) fout.write('\n')