def main():
    """Convert RefSeqGene GenBank flat files to a tab-delimited table of
    NG accession, gene symbol, and RefSeq summary text.

    NOTE(review): this file defines an identical main() twice; this earlier
    definition is shadowed by the later one and never executes.
    """
    # command line arguments
    parser = argparse.ArgumentParser(
        description='Convert RefGene Genbank protein files to tab-delimited database files',
        epilog='pypeline.refgene2db version 1.0β1 ©2011-2012 Michael Yourshaw all rights reserved')
    parser.add_argument('--ref', '-r', required=True,
                        help='path to gene_RefSeqGene file')
    parser.add_argument('--input', '-i', nargs='+',
                        help='downloaded refseqgeneN.genomic.gbff.gz files')
    parser.add_argument('--output', '-o', required=True,
                        help='output file')
    args = parser.parse_args()
    # Map NG accession (version suffix stripped) -> gene symbol from the
    # tab-delimited gene_RefSeqGene index (col 3 = RSG accession, col 2 = symbol).
    locus2gene = {}
    with open(args.ref) as ref:
        for r in ref:
            if r.startswith('#'):
                continue
            r = r.rstrip('\n').split('\t')
            locus2gene[r[3].split('.')[0]] = r[2]
    inputs = my.unglob(args.input)
    locus2comment = {}
    locus_count = 0
    # Strip the trailing "[provided by RefSeq...]" attribution from summaries.
    refseq_re = re.compile(r"\s*\[provided by RefSeq.*\]", re.I)
    for input in inputs:
        # FIX: the handle returned by my.open_gz_or_text was never closed;
        # it is used as a context manager elsewhere in this file, so do the same.
        with my.open_gz_or_text(input) as f:
            in_comment = False
            for line in f:
                if line[0:5] == "LOCUS":
                    locus = line.split()[1]
                    comment = ""
                    locus_count += 1
                elif line[0:7] == "COMMENT":
                    in_comment = True
                    comment += line.split(" ")[1].replace("\n", " ")
                elif line[0:7] == "PRIMARY":
                    # PRIMARY record marks the end of the COMMENT section.
                    in_comment = False
                    try:
                        summary = comment.split("Summary:")[1]
                    except IndexError:
                        # FIX: was a bare except; only the missing "Summary:"
                        # marker (single-element split) is expected here.
                        summary = comment
                    locus2comment[locus] = refseq_re.split(summary)[0]
                elif in_comment:
                    # COMMENT continuation lines: keep text after the first space.
                    comment += line.split(" ")[1].replace("\n", " ")
    with open(args.output, 'w') as output:
        output.write('#NG_ID\tGeneSymbol\tSUMMARY\n')
        for locus in sorted(locus2comment):
            output.write('{}\t{}\t{}\n'.format(
                locus, locus2gene.get(locus, ''), locus2comment[locus]))
def main():
    """Convert RefSeqGene GenBank flat files to a tab-delimited table of
    NG accession, gene symbol, and RefSeq summary text.

    NOTE(review): this file defines an identical main() twice; this later
    definition is the one that executes. The earlier duplicate should be
    removed.
    """
    # command line arguments
    parser = argparse.ArgumentParser(
        description='Convert RefGene Genbank protein files to tab-delimited database files',
        epilog='pypeline.refgene2db version 1.0β1 ©2011-2012 Michael Yourshaw all rights reserved')
    parser.add_argument('--ref', '-r', required=True,
                        help='path to gene_RefSeqGene file')
    parser.add_argument('--input', '-i', nargs='+',
                        help='downloaded refseqgeneN.genomic.gbff.gz files')
    parser.add_argument('--output', '-o', required=True,
                        help='output file')
    args = parser.parse_args()
    # Map NG accession (version suffix stripped) -> gene symbol from the
    # tab-delimited gene_RefSeqGene index (col 3 = RSG accession, col 2 = symbol).
    locus2gene = {}
    with open(args.ref) as ref:
        for r in ref:
            if r.startswith('#'):
                continue
            r = r.rstrip('\n').split('\t')
            locus2gene[r[3].split('.')[0]] = r[2]
    inputs = my.unglob(args.input)
    locus2comment = {}
    locus_count = 0
    # Strip the trailing "[provided by RefSeq...]" attribution from summaries.
    refseq_re = re.compile(r"\s*\[provided by RefSeq.*\]", re.I)
    for input in inputs:
        # FIX: the handle returned by my.open_gz_or_text was never closed;
        # it is used as a context manager elsewhere in this file, so do the same.
        with my.open_gz_or_text(input) as f:
            in_comment = False
            for line in f:
                if line[0:5] == "LOCUS":
                    locus = line.split()[1]
                    comment = ""
                    locus_count += 1
                elif line[0:7] == "COMMENT":
                    in_comment = True
                    comment += line.split(" ")[1].replace("\n", " ")
                elif line[0:7] == "PRIMARY":
                    # PRIMARY record marks the end of the COMMENT section.
                    in_comment = False
                    try:
                        summary = comment.split("Summary:")[1]
                    except IndexError:
                        # FIX: was a bare except; only the missing "Summary:"
                        # marker (single-element split) is expected here.
                        summary = comment
                    locus2comment[locus] = refseq_re.split(summary)[0]
                elif in_comment:
                    # COMMENT continuation lines: keep text after the first space.
                    comment += line.split(" ")[1].replace("\n", " ")
    with open(args.output, 'w') as output:
        output.write('#NG_ID\tGeneSymbol\tSUMMARY\n')
        for locus in sorted(locus2comment):
            output.write('{}\t{}\t{}\n'.format(
                locus, locus2gene.get(locus, ''), locus2comment[locus]))
def run(input, table=None, skip=None, header_line='#', column_names=None, no_column_names=False):
    """Scan a tab-delimited (optionally gzipped) text file and write a MySQL
    table-creation script describing its columns.

    Parameters:
        input: path to the input file (plain text or gzip).
        table: table name; also names the <table>.mysql output file. When
            omitted, the output is <input>.table.mysql and the table name is
            derived from the input basename.
        skip: after the header is found, lines starting with this prefix are
            ignored.
        header_line: a positive 1-based line number of the header row, or a
            prefix string marking the header line ('#' by default; a '#'
            prefix requires the second char not to be '#', so '##' metadata
            lines are skipped).
        column_names: explicit list of column names; overrides detection.
        no_column_names: if True, synthesize names col1..colN from the width
            of the first non-blank line.

    Raises:
        SqlColumnsError: if no columns could be identified.
    """
    header_line_number = int(header_line) if my.is_int(header_line) and int(header_line) > 0 else None
    header_chars = header_line if not my.is_int(header_line) else None
    output = os.path.join(os.path.dirname(input), table + '.mysql') if table else input + '.table.mysql'
    with my.open_gz_or_text(input) as input_file, open(output, 'w') as mysql:
        column_widths = {}
        column_names_list = None
        line_count = 0
        first_data_line = None
        header_found = False if header_line_number or header_chars else True
        # BUG FIX: the original reassigned column_names = {} *before* this
        # check, so an explicitly supplied column_names list was always
        # ignored. Preserve the parameter and only default it when absent.
        if column_names:
            header_found = True
            column_names_list = column_names
            column_names = {i: column_names_list[i] for i in range(len(column_names_list))}
            sql_column_spec = my.get_sql_column_spec(column_names_list)
        else:
            column_names = {}
        for line in input_file:
            line_count += 1
            line = line.rstrip('\n')
            if not line.strip():
                continue  # ignore blank lines entirely
            if not header_found:
                if no_column_names:
                    # Synthesize col1..colN from the first non-blank line.
                    # (As in the original, this first line itself is not
                    # processed as a data row.)
                    header_found = True
                    column_names_list = ['col' + str(i + 1) for i in range(len(line.split('\t')))]
                    column_names = {i: column_names_list[i] for i in range(len(column_names_list))}
                    sql_column_spec = my.get_sql_column_spec(column_names_list)
                elif ((header_line_number and header_line_number == line_count)
                      or (header_chars and header_chars != '#' and line.startswith(header_chars))
                      or (header_chars and header_chars == '#' and line.startswith(header_chars) and line[1] != '#')):
                    header_found = True
                    column_names_list = line.split('\t')
                    column_names = {i: column_names_list[i] for i in range(len(column_names_list))}
                    sql_column_spec = my.get_sql_column_spec(column_names_list)
                    continue
            elif skip and line.startswith(skip):
                continue
            else:  # data line
                if first_data_line is None:
                    first_data_line = line_count
                fields = line.split('\t')
                # Track the maximum observed width per column. (The original
                # rebuilt the dict from the current row's indices, dropping
                # widths of columns missing from shorter rows.)
                for i, field in enumerate(fields):
                    column_widths[i] = max(len(field), column_widths.get(i, 0))
                sql_data_dict = my.get_sql_data_dict(column_names, fields)
                my.update_sql_column_spec(sql_column_spec, sql_data_dict)
        if not (column_names_list and sql_column_spec):
            raise SqlColumnsError('Could not identify columns. Try using --column_names or --no_column_names.')
        mysql_scripts = my.get_mysql_scripts(
            table_name=table if table else os.path.basename(input).replace('.', '_')[:64],
            index_base_name=os.path.basename(input).replace('.', '_')[:16],
            indices=[],
            columns_out=column_names_list,
            columns_out_spec=sql_column_spec,
            # FIX: first_data_line is None when the file has no data rows;
            # the original raised TypeError here. Skip every scanned line in
            # that case — TODO confirm intended rows_to_delete semantics.
            rows_to_delete=first_data_line - 1 if first_data_line is not None else line_count)
        mysql.write(mysql_scripts[0])