Example #1
0
def main():
    """Write a tab-delimited locus / gene / summary table from GenBank files.

    Command-line arguments:
        --ref, -r     path to the gene_RefSeqGene file (required). It is read
                      as tab-delimited text; the 4th column, with anything
                      after the first '.' dropped, is the lookup key and the
                      3rd column is written out as ``GeneSymbol``.
        --input, -i   one or more GenBank flat files (expanded with
                      ``my.unglob`` and opened with ``my.open_gz_or_text``).
        --output, -o  path of the table to write (required).

    For every record the text of the COMMENT block is collected. When the
    PRIMARY line is reached, the part following ``Summary:`` (or the whole
    comment if that marker is absent) is kept, cut off at the first
    ``[provided by RefSeq...]`` tag. Records that never reach a PRIMARY line
    produce no output row.
    """

    #command line arguments
    parser = argparse.ArgumentParser(
        description=
        'Convert RefGene Genbank protein files to tab-delimited database files',
        epilog=
        'pypeline.refgene2db version 1.0β1 ©2011-2012 Michael Yourshaw all rights reserved'
    )
    parser.add_argument('--ref',
                        '-r',
                        required=True,
                        help='path to gene_RefSeqGene file')
    parser.add_argument('--input',
                        '-i',
                        nargs='+',
                        help='downloaded refseqgeneN.genomic.gbff.gz files')
    parser.add_argument('--output', '-o', required=True, help='output file')
    args = parser.parse_args()

    # locus (4th column without its ".version" suffix) -> 3rd column
    locus2gene = {}
    with open(args.ref) as ref:
        for r in ref:
            # Blank lines have no 4th column; skip them along with comments.
            if r.startswith('#') or not r.strip():
                continue
            fields = r.rstrip('\n').split('\t')
            locus2gene[fields[3].split('.')[0]] = fields[2]

    inputs = my.unglob(args.input)
    locus2comment = {}
    refseq_re = re.compile(r"\s*\[provided by RefSeq.*\]", re.I)
    for path in inputs:
        # Per-file parser state, so nothing carries over from a previous file.
        locus = None
        comment = ""
        in_comment = False
        # The context manager closes the handle even if parsing fails.
        with my.open_gz_or_text(path) as f:
            for line in f:
                if line[0:5] == "LOCUS":
                    locus = line.split()[1]
                    comment = ""
                    # A new record always starts outside of a comment block,
                    # even if the previous record had no PRIMARY line.
                    in_comment = False
                elif line[0:7] == "COMMENT":
                    in_comment = True
                    # Text after the first run of four spaces; a line with
                    # no such run contributes nothing.
                    pieces = line.split("    ")
                    if len(pieces) > 1:
                        comment += pieces[1].replace("\n", " ")
                elif line[0:7] == "PRIMARY":
                    in_comment = False
                    if locus is None:
                        # PRIMARY before any LOCUS: nothing to attach it to.
                        continue
                    pieces = comment.split("Summary:")
                    summary = pieces[1] if len(pieces) > 1 else comment
                    locus2comment[locus] = refseq_re.split(summary)[0]
                elif in_comment:
                    # Continuation lines carry a 12-space indent; lines
                    # without it (e.g. stripped blank lines) add nothing.
                    pieces = line.split("            ")
                    if len(pieces) > 1:
                        comment += pieces[1].replace("\n", " ")

    with open(args.output, 'w') as output:
        output.write('#NG_ID\tGeneSymbol\tSUMMARY\n')
        for locus in sorted(locus2comment):
            output.write('{}\t{}\t{}\n'.format(locus,
                                               locus2gene.get(locus, ''),
                                               locus2comment[locus]))
Example #2
0
def main():
    """Convert GenBank flat files into a tab-delimited summary table.

    Command-line arguments:
        --ref, -r     path to the gene_RefSeqGene file (required); read as
                      tab-delimited text. The 4th column, truncated at its
                      first '.', is the lookup key and the 3rd column is the
                      value written under ``GeneSymbol``.
        --input, -i   one or more GenBank flat files (expanded with
                      ``my.unglob`` and opened with ``my.open_gz_or_text``).
        --output, -o  path of the table to write (required).

    The COMMENT block of each record is accumulated until the PRIMARY line.
    At that point the text after ``Summary:`` (or the entire comment when the
    marker is missing) is stored, truncated at the first
    ``[provided by RefSeq...]`` tag. A record without a PRIMARY line yields
    no output row.
    """

    #command line arguments
    parser = argparse.ArgumentParser(
        description = 'Convert RefGene Genbank protein files to tab-delimited database files',
        epilog = 'pypeline.refgene2db version 1.0β1 ©2011-2012 Michael Yourshaw all rights reserved')
    parser.add_argument('--ref', '-r', required=True,
        help='path to gene_RefSeqGene file')
    parser.add_argument('--input', '-i', nargs='+',
        help='downloaded refseqgeneN.genomic.gbff.gz files')
    parser.add_argument('--output', '-o', required=True,
        help='output file')
    args = parser.parse_args()

    # locus (4th column without its ".version" suffix) -> 3rd column
    locus2gene = {}
    with open(args.ref) as ref:
        for r in ref:
            # Blank lines have no 4th column; skip them along with comments.
            if r.startswith('#') or not r.strip():
                continue
            fields = r.rstrip('\n').split('\t')
            locus2gene[fields[3].split('.')[0]] = fields[2]

    inputs = my.unglob(args.input)
    locus2comment = {}
    refseq_re = re.compile(r"\s*\[provided by RefSeq.*\]",re.I)
    for path in inputs:
        # Per-file parser state, so nothing carries over from a previous file.
        locus = None
        comment = ""
        in_comment = False
        # The context manager closes the handle even if parsing fails.
        with my.open_gz_or_text(path) as f:
            for line in f:
                if line[0:5] == "LOCUS":
                    locus = line.split()[1]
                    comment = ""
                    # A new record always starts outside of a comment block,
                    # even if the previous record had no PRIMARY line.
                    in_comment = False
                elif line[0:7] == "COMMENT":
                    in_comment = True
                    # Text after the first run of four spaces; a line with
                    # no such run contributes nothing.
                    pieces = line.split("    ")
                    if len(pieces) > 1:
                        comment += pieces[1].replace("\n", " ")
                elif line[0:7] == "PRIMARY":
                    in_comment = False
                    if locus is None:
                        # PRIMARY before any LOCUS: nothing to attach it to.
                        continue
                    pieces = comment.split("Summary:")
                    summary = pieces[1] if len(pieces) > 1 else comment
                    locus2comment[locus] = refseq_re.split(summary)[0]
                elif in_comment:
                    # Continuation lines carry a 12-space indent; lines
                    # without it (e.g. stripped blank lines) add nothing.
                    pieces = line.split("            ")
                    if len(pieces) > 1:
                        comment += pieces[1].replace("\n", " ")

    with open(args.output,'w') as output:
        output.write('#NG_ID\tGeneSymbol\tSUMMARY\n')
        for locus in sorted(locus2comment):
            output.write('{}\t{}\t{}\n'.format(locus, locus2gene.get(locus,''), locus2comment[locus]))
Example #3
0
def run(input, table=None, skip=None, header_line='#', column_names=None, no_column_names=False):
    """Scan a tab-delimited file and write a MySQL script describing its columns.

    The file is read once. Column names are taken from ``column_names``, from
    a header line located via ``header_line``, or synthesized when
    ``no_column_names`` is set. Every data row is then fed to
    ``my.update_sql_column_spec`` to refine the per-column SQL specification.
    The first script returned by ``my.get_mysql_scripts`` is written out.

    Args:
        input: Path of the file to scan (opened with ``my.open_gz_or_text``).
        table: Table name. When given, the script is written to
            ``<directory of input>/<table>.mysql``; otherwise to
            ``<input>.table.mysql`` and the table name is the input's base
            name with '.' replaced by '_', truncated to 64 characters.
        skip: Optional prefix; lines after the header that start with it are
            ignored.
        header_line: Either a positive integer (1-based line number of the
            header line) or a prefix string identifying the header line. For
            the default '#', a line starting with '##' is not the header.
        column_names: Optional sequence of column names. When supplied, no
            header detection takes place and every non-blank, non-skipped
            line is treated as data.
        no_column_names: When true and header detection is active, names
            ``col1..colN`` are generated from the field count of the first
            non-blank line.
            NOTE(review): that first line is not itself processed as a data
            row - confirm this is intended.

    Raises:
        SqlColumnsError: If no column names could be determined.
    """

    header_line_number = int(header_line) if my.is_int(header_line) and int(header_line) > 0 else None
    header_chars = header_line if not my.is_int(header_line) else None

    output = os.path.join(os.path.dirname(input), table + '.mysql') if table else input + '.table.mysql'
    with my.open_gz_or_text(input) as input_file, open(output, 'w') as mysql:
        # Position -> name mapping, kept in its own local so the
        # ``column_names`` argument is not overwritten before it is checked.
        index2name = {}
        column_names_list = None
        sql_column_spec = None
        line_count = 0
        first_data_line = None
        header_found = not (header_line_number or header_chars)
        if column_names:
            header_found = True
            column_names_list = column_names
            index2name = dict(enumerate(column_names_list))
            sql_column_spec = my.get_sql_column_spec(column_names_list)

        for line in input_file:
            line_count += 1
            line = line.rstrip('\n')
            if not line.strip():
                continue

            if not header_found:
                is_header = (
                    (header_line_number and header_line_number == line_count)
                    or (header_chars and header_chars != '#' and line.startswith(header_chars))
                    # startswith('##') instead of line[1] so that a line
                    # consisting of a lone '#' cannot raise IndexError.
                    or (header_chars == '#' and line.startswith('#') and not line.startswith('##'))
                )
                if no_column_names:
                    header_found = True
                    column_names_list = ['col' + str(i + 1) for i in range(len(line.split('\t')))]
                elif is_header:
                    header_found = True
                    column_names_list = line.split('\t')
                if header_found:
                    index2name = dict(enumerate(column_names_list))
                    sql_column_spec = my.get_sql_column_spec(column_names_list)
                continue

            if skip and line.startswith(skip):
                continue

            # data line
            if column_names_list is None:
                # Header detection was disabled (non-positive integer
                # header_line) and no names were supplied: fail with the
                # domain error instead of an unbound-variable NameError.
                raise SqlColumnsError('Could not identify columns. Try using --column_names or --no_column_names.')
            if first_data_line is None:
                first_data_line = line_count
            fields = line.split('\t')
            sql_data_dict = my.get_sql_data_dict(index2name, fields)
            my.update_sql_column_spec(sql_column_spec, sql_data_dict)

        if not (column_names_list and sql_column_spec):
            raise SqlColumnsError('Could not identify columns. Try using --column_names or --no_column_names.')

        # Lines preceding the first data row. With no data rows at all, every
        # line that was read is a non-data line.
        # NOTE(review): assumes rows_to_delete means "leading lines to drop" -
        # confirm against my.get_mysql_scripts.
        rows_to_delete = first_data_line - 1 if first_data_line is not None else line_count

        mysql_scripts = my.get_mysql_scripts(
            table_name=table if table else os.path.basename(input).replace('.', '_')[:64],
            index_base_name=os.path.basename(input).replace('.', '_')[:16],
            indices=[],
            columns_out=column_names_list,
            columns_out_spec=sql_column_spec,
            rows_to_delete=rows_to_delete
            )
        mysql.write(mysql_scripts[0])