def process_source(genome_in, metadata=None):
    """
    Take source, uncompress, sort, and convert to GFF as needed, yield GFF

    This is a generator. It yields the genome build first ("b36", "b37", or
    DEFAULT_BUILD when no "##genome-build" header line is seen), then the
    header lines in their original order, then the remaining lines as
    emitted by UNIX sort. Yielded lines carry no trailing newline.

    Arguments:
    genome_in -- passed to autozip.file_open to read the genome data
    metadata -- optional dict; its 'input_type' key is set to the value
                returned by detect_format. A new dict is used when omitted.

    Raises Exception if the detected format is not GFF, CGIVAR or 23ANDME,
    or if a "##genome-build" line has no value or an unknown value.
    """
    # A dict() default would be created once and shared between calls.
    if metadata is None:
        metadata = dict()

    # Handle genome compression, get input, and make best guess of format type.
    source_input = autozip.file_open(genome_in, 'r')
    metadata['input_type'] = detect_format(source_input)

    # Reset input and convert to GFF if necessary.
    source_input.close()
    source_input = autozip.file_open(genome_in, 'r')

    if metadata['input_type'] == "GFF":
        gff_input = source_input
    elif metadata['input_type'] == "CGIVAR":
        gff_input = cgivar_to_gff.convert(source_input)
    elif metadata['input_type'] == "23ANDME":
        gff_input = gff_from_23andme.convert(source_input)
    else:
        # Fail explicitly instead of printing and then hitting an unbound
        # gff_input further down.
        source_input.close()
        raise Exception("ERROR: genome file format not recognized")

    # Grab header (don't sort) & genome build. Pipe the rest to UNIX sort.
    header_done = False
    header = []
    sort_cmd = ['sort', '--buffer-size=20%', '--key=1,1', '--key=5n,5', '--key=4n,4']
    sort_out = subprocess.Popen(sort_cmd, stdin=subprocess.PIPE,
                                stdout=subprocess.PIPE, bufsize=1)
    genome_build = DEFAULT_BUILD
    b36_list = ["hg18", "36", "b36", "build36", "NCBI36"]
    b37_list = ["hg19", "37", "b37", "build37", "GRCh37"]
    for line in gff_input:
        if not header_done:
            if line.startswith('#'):
                header.append(line)
                if line.startswith("##genome-build"):
                    gbdata = line.split()
                    if len(gbdata) < 2:
                        raise Exception("no genome build specified?")
                    elif gbdata[1] in b36_list:
                        genome_build = "b36"
                    elif gbdata[1] in b37_list:
                        genome_build = "b37"
                    else:
                        raise Exception("genome build uninterpretable")
                continue
            # First non-header line: the header is over, and this line is
            # data that must be sorted too (it used to be dropped).
            header_done = True
        sort_out.stdin.write(str(line.rstrip('\n')) + '\n')
    sort_out.stdin.close()
    source_input.close()

    # Yield the genome build, followed by the GFF data.
    yield genome_build
    for line in header:
        yield line.rstrip('\n')
    for line in sort_out.stdout:
        yield line.rstrip('\n')

    # Reap the sort child once its output is exhausted.
    sort_out.stdout.close()
    sort_out.wait()
def read_metadata(genome_id):
    """Open file containing metadata, return it

    Reads the first line of
    '/home/trait/upload/<genome_id>-out/metadata.json' through autozip and
    returns it parsed as JSON. The file is closed even if reading or
    parsing fails.
    """
    metadata_path = '/home/trait/upload/' + genome_id + '-out/metadata.json'
    f_meta = autozip.file_open(metadata_path)
    try:
        # Only the first line of the file is parsed.
        return json.loads(next(f_meta))
    finally:
        f_meta.close()
def read_metadata(genome_id):
    """Open file containing metadata, return it

    The metadata file lives at
    "/home/trait/upload/<genome_id>-out/metadata.json". Its first line is
    decoded as JSON and returned. The handle is always closed, including
    when the read or the decode raises.
    """
    metadata_path = "/home/trait/upload/" + genome_id + "-out/metadata.json"
    f_meta = autozip.file_open(metadata_path)
    try:
        first_line = next(f_meta)
        metadata = json.loads(first_line)
    finally:
        f_meta.close()
    return metadata
Exemple #4
0
def convert(cgi_input, options=None):
    """Generator that converts CGI var data to GFF-formated strings

    cgi_input is either a str (opened through autozip) or an iterable of
    lines. Leading '#' lines are read for the genome build and software
    version; a "##genome-build" line is yielded once they end. When
    options has a truthy `chromosome`, only rows whose fourth column equals
    it are converted, and reading stops after that block ends.
    """
    # A str is treated as something to open; anything else is iterated as-is.
    if isinstance(cgi_input, str):
        cgi_lines = autozip.file_open(cgi_input, 'r')
    else:
        cgi_lines = cgi_input

    genome_build = DEFAULT_BUILD
    version = DEFAULT_SOFTWARE_VER
    in_header = True
    chromosome_seen = False
    for row in cgi_lines:
        # Handle the header, get the genome build if you can.
        if in_header:
            if re.match("#", row):
                if re.match("#GENOME_REFERENCE.*NCBI build 37", row):
                    genome_build = "b37"
                elif re.match("#GENOME_REFERENCE.*NCBI build 36", row):
                    genome_build = "b36"
                version_match = re.match(r"#SOFTWARE_VERSION\W+([0-9.]+)", row)
                if version_match:
                    version = version_match.groups()[0]
                continue
            # Output GFF header once we're done reading CGI's.
            yield "##genome-build " + genome_build
            in_header = False

        # Skip lines with no word characters, and the '>' table header.
        # TODO: use table header instead of assuming which column to use
        if re.search(r"^\W*$", row) or re.search("^>", row):
            continue

        # Handle data
        fields = row.rstrip('\n').split("\t")

        if options and options.chromosome:
            if fields[3] == options.chromosome:
                chromosome_seen = True
            elif chromosome_seen:
                # Assume all base calls for a single chromosome are in a contiguous block
                break
            else:
                continue

        if fields[2] == "all" or fields[1] == "1":
            # The output from process_full_position is a str.
            result = process_full_position(fields, version)
        else:
            assert fields[2] == "1"
            # The output from process_split_position is a str generator;
            # it may end up calling itself recursively.
            result = process_split_position(fields, cgi_lines, version)

        if not result:
            continue
        if isinstance(result, str):
            yield result
        else:
            for gff_line in result:
                yield gff_line
Exemple #5
0
def main():
    """Main function.

    Parses the command line (child GFF, parent A GFF, optional parent B
    GFF), builds a PhaseTrio and writes each line from call_phase() either
    to the file named by --output or to standard output. Each line is
    followed by exactly one newline in both cases.
    """
    usage = 'usage: %prog [options] gff_child gff_parentA [gff_parentB]'
    parser = optparse.OptionParser(usage=usage)
    parser.add_option('-o', '--output', help="Specificies an option output " \
        + "file name. Default is standard output.", dest='f_out',
        action='store')
    parser.add_option('-m', '--mend_errs', help="If set, report mendelian " \
        + "inheritance errors as an attribute. Default is to ignore them.",
        dest='mend_errs', action='store_true', default=False)
    (opts, args) = parser.parse_args()

    if len(args) < 2:
        parser.error("Need atleast 2 input file arguments.")

    child = args[0]
    parent_a = args[1]
    parent_b = None
    if len(args) > 2:
        parent_b = args[2]

    trioizer = PhaseTrio(child, parent_a, parent_b, opts.mend_errs)
    if opts.f_out:
        out = autozip.file_open(opts.f_out, 'w')
        try:
            for line in trioizer.call_phase():
                out.write('%s\n' % line)
        finally:
            # Close the output so buffered (possibly compressed) data is
            # flushed even if phasing fails part-way.
            out.close()
    else:
        for line in trioizer.call_phase():
            # print already appends a newline; adding '\n' as well put a
            # blank line after every record, unlike the file output.
            print('%s' % line)
def match_getev_pph2(getev_file, pph2_file):
    """Print PolyPhen-2 values for variants listed in a GET-Evidence flatfile.

    Arguments:
    getev_file -- a str (opened through autozip) or an iterable of JSON
                  lines that offers close(); it is closed after reading
    pph2_file -- path of a bz2-compressed tar archive; members named
                 'pph2_whpss/<uniprot>.pph2.txt' are scanned

    Every archive row is split on tabs. If "<gene>-" followed by columns 3,
    2 and 4 matches a GET-Evidence "<gene>-<aa_change_short>" key and
    column 16 is non-empty, the key, the variant_id and column 16 are
    printed separated by tabs. Rows with fewer than 17 columns are skipped.
    """
    # Use these files to create a link between uniprot ID and gene name.
    kgwname_file = '/home/trait/data/knownGene_hg18_sorted.txt'
    kgxref_file = '/home/trait/data/kgXref_hg18.txt.gz'
    ucsc_to_name = process_kgwname(kgwname_file)
    uniprot_to_genename = process_kgxref(kgxref_file, ucsc_to_name)

    # Read GET-Evidence flatfile
    getev_variants = dict()
    if isinstance(getev_file, str):
        getev_in = autozip.file_open(getev_file)
    else:
        getev_in = getev_file
    try:
        for line in getev_in:
            getev_data = json.loads(line)
            if 'gene' in getev_data and 'aa_change_short' in getev_data:
                gene_aachange_key = (getev_data['gene'] + '-' +
                                     getev_data['aa_change_short'])
                getev_variants[gene_aachange_key] = getev_data['variant_id']
    finally:
        getev_in.close()

    # Read Polyphen 2 data and return scores for GET-Ev variants
    member_re = re.compile(r'pph2_whpss/(.*)\.pph2\.txt')
    pph2_tar = tarfile.open(name=pph2_file, mode='r:bz2')
    try:
        for taritem in pph2_tar:
            name_match = member_re.match(str(taritem.name))
            if not name_match:
                continue
            uniprot = name_match.group(1)
            if uniprot not in uniprot_to_genename:
                continue
            gene = uniprot_to_genename[uniprot]
            pph2_genedata = pph2_tar.extractfile(taritem)
            for line in pph2_genedata:
                pph2_data = re.split(' *\t *', line.rstrip('\n'))
                # Columns 2-4 and 16 are read below; skip shorter rows.
                if len(pph2_data) < 17:
                    continue
                key = gene + '-' + pph2_data[3] + pph2_data[2] + pph2_data[4]
                if key in getev_variants and pph2_data[16]:
                    print('\t'.join([key, getev_variants[key], pph2_data[16]]))
    finally:
        pph2_tar.close()
def main():
    """Main function.

    Reads the child and parent GFF file names from the command line, runs
    PhaseTrio over them and emits every line produced by call_phase(), one
    per output line, to the --output file or to standard output.
    """
    usage = 'usage: %prog [options] gff_child gff_parentA [gff_parentB]'
    parser = optparse.OptionParser(usage=usage)
    parser.add_option('-o', '--output', help="Specificies an option output " \
        + "file name. Default is standard output.", dest='f_out',
        action='store')
    parser.add_option('-m', '--mend_errs', help="If set, report mendelian " \
        + "inheritance errors as an attribute. Default is to ignore them.",
        dest='mend_errs', action='store_true', default=False)
    (opts, args) = parser.parse_args()

    if len(args) < 2:
        parser.error("Need atleast 2 input file arguments.")

    child = args[0]
    parent_a = args[1]
    # The second parent is optional.
    parent_b = args[2] if len(args) > 2 else None

    trioizer = PhaseTrio(child, parent_a, parent_b, opts.mend_errs)
    if not opts.f_out:
        for line in trioizer.call_phase():
            # print supplies the newline itself; the extra '\n' that used to
            # be formatted in produced a blank line after every record.
            print('%s' % line)
        return

    out = autozip.file_open(opts.f_out, 'w')
    try:
        for line in trioizer.call_phase():
            out.write('%s\n' % line)
    finally:
        # Always release the output file, even when phasing raises.
        out.close()
def convert(cgi_input, options=None):
    """Generator that converts CGI var data to GFF-formated strings

    Accepts a str (opened via autozip) or an iterable of lines. The '#'
    header is scanned for the NCBI build and the software version, then a
    "##genome-build" line is yielded followed by the converted rows. If
    options.chromosome is set, rows for other chromosomes are skipped and
    conversion ends once the requested chromosome's block is over.
    """
    # Set up CGI input. Default is to assume a str generator.
    source = cgi_input
    if isinstance(source, str):
        source = autozip.file_open(source, 'r')

    build_name = DEFAULT_BUILD
    sw_version = DEFAULT_SOFTWARE_VER
    reading_header = True
    found_chromosome = False
    for text in source:
        # Handle the header, get the genome build if you can.
        if reading_header and re.match("#", text):
            if re.match("#GENOME_REFERENCE.*NCBI build 37", text):
                build_name = "b37"
            elif re.match("#GENOME_REFERENCE.*NCBI build 36", text):
                build_name = "b36"
            found = re.match(r"#SOFTWARE_VERSION\W+([0-9.]+)", text)
            if found:
                sw_version = found.groups()[0]
            continue
        if reading_header:
            # Output GFF header once we're done reading CGI's.
            yield "##genome-build " + build_name
            reading_header = False

        # Lines without any word character carry no data.
        if re.search(r"^\W*$", text):
            continue
        # TODO: use table header instead of assuming which column to use
        if re.search("^>", text):
            continue

        # Handle data
        columns = text.rstrip('\n').split("\t")

        if options and options.chromosome:
            on_target = columns[3] == options.chromosome
            if not on_target and found_chromosome:
                # Assume all base calls for a single chromosome are in a contiguous block
                break
            if not on_target:
                continue
            found_chromosome = True

        if columns[2] == "all" or columns[1] == "1":
            # The output from process_full_position is a str.
            converted = process_full_position(columns, sw_version)
        else:
            assert columns[2] == "1"
            # The output from process_split_position is a str generator;
            # it may end up calling itself recursively.
            converted = process_split_position(columns, source, sw_version)

        if converted:
            if isinstance(converted, str):
                yield converted
            else:
                for item in converted:
                    yield item
Exemple #9
0
def match_getev_pph2(getev_file, pph2_file):
    """Print PolyPhen-2 values for variants listed in a GET-Evidence flatfile.

    Arguments:
    getev_file -- a str (opened through autozip) or an iterable of JSON
                  lines that offers close(); it is closed after reading
    pph2_file -- path of a bz2-compressed tar archive; members named
                 'pph2_whpss/<uniprot>.pph2.txt' are scanned

    For each tab-split archive row, a key is built from the gene name and
    columns 3, 2 and 4. When that key was seen in the GET-Evidence data and
    column 16 is non-empty, the key, variant_id and column 16 are printed
    tab-separated. Rows too short to hold column 16 are skipped.
    """
    # Use these files to create a link between uniprot ID and gene name.
    kgwname_file = '/home/trait/data/knownGene_hg18_sorted.txt'
    kgxref_file = '/home/trait/data/kgXref_hg18.txt.gz'
    ucsc_to_name = process_kgwname(kgwname_file)
    uniprot_to_genename = process_kgxref(kgxref_file, ucsc_to_name)

    # Read GET-Evidence flatfile
    getev_variants = dict()
    if isinstance(getev_file, str):
        getev_in = autozip.file_open(getev_file)
    else:
        getev_in = getev_file
    try:
        for line in getev_in:
            getev_data = json.loads(line)
            if 'gene' in getev_data and 'aa_change_short' in getev_data:
                gene_aachange_key = (getev_data['gene'] + '-' +
                                     getev_data['aa_change_short'])
                getev_variants[gene_aachange_key] = getev_data['variant_id']
    finally:
        # Release the input even if a line is not valid JSON.
        getev_in.close()

    # Read Polyphen 2 data and return scores for GET-Ev variants
    pph2_tar = tarfile.open(name=pph2_file, mode='r:bz2')
    try:
        for taritem in pph2_tar:
            # Match once and reuse the result for the uniprot ID.
            matched = re.match(r'pph2_whpss/(.*)\.pph2\.txt', str(taritem.name))
            if matched is None:
                continue
            uniprot = matched.group(1)
            if uniprot in uniprot_to_genename:
                gene = uniprot_to_genename[uniprot]
                pph2_genedata = pph2_tar.extractfile(taritem)
                for line in pph2_genedata:
                    pph2_data = re.split(' *\t *', line.rstrip('\n'))
                    if len(pph2_data) <= 16:
                        # Not enough columns to hold the value printed below.
                        continue
                    key = gene + '-' + pph2_data[3] + pph2_data[2] + pph2_data[4]
                    if key in getev_variants and pph2_data[16]:
                        print('\t'.join([key, getev_variants[key], pph2_data[16]]))
    finally:
        pph2_tar.close()
def convert_to_file(genotype_input, output_file):
    """Convert a deCODEme file and output GFF-formatted data to file

    genotype_input is handed to convert(). output_file is either a str
    (opened for writing through autozip) or a writable file object. In both
    cases it is closed when conversion ends, including when it fails.
    """
    output = output_file  # default assumes writable file object
    if isinstance(output_file, str):
        output = autozip.file_open(output_file, 'w')
    try:
        for line in convert(genotype_input):
            output.write(line + "\n")
    finally:
        # Close on error too, so the handle is not leaked.
        output.close()
Exemple #11
0
def process_kgwname(kgwname_file):
    """Return dict linking UCSC IDs to gene names from first column

    Each line is split on whitespace; the second field becomes the key and
    the first field the value. Lines with fewer than two fields (blank
    lines, for instance) are skipped. The file is closed even on error.
    """
    ucsc_to_name = dict()
    f_in = autozip.file_open(kgwname_file, 'r')
    try:
        for line in f_in:
            data = line.split()
            if len(data) < 2:
                continue
            ucsc_to_name[data[1]] = data[0]
    finally:
        f_in.close()
    return ucsc_to_name
def convert_to_file(cgi_input, output_file):
    """Convert a CGI var file and output GFF-formatted data to file

    cgi_input is handed to convert(). output_file may be a str, which is
    opened for writing through autozip, or an already writable file object.
    The output is always closed afterwards, also when conversion raises.
    """
    output = output_file  # default assumes writable file object
    if isinstance(output_file, str):
        output = autozip.file_open(output_file, 'w')
    try:
        conversion = convert(cgi_input)  # set up generator
        for line in conversion:
            output.write(line + "\n")
    finally:
        output.close()
Exemple #13
0
def convert_to_file(genotype_input, output_file):
    """Convert a Family Tree DNA file and output GFF-formatted data to file

    Every line yielded by convert(genotype_input) is written followed by a
    newline. output_file is a str (opened for writing through autozip) or a
    writable file object; it is closed at the end, even after an error.
    """
    output = output_file  # default assumes writable file object
    if isinstance(output_file, str):
        output = autozip.file_open(output_file, 'w')
    try:
        for line in convert(genotype_input):
            output.write(line + "\n")
    finally:
        # Guarantees the handle is released if convert() raises mid-way.
        output.close()
Exemple #14
0
def getev_reprocess(genotype_file, server=None, options=None):
    """Redo analysis against GET-Evidence data

    Arguments:
    genotype_file -- passed to processing_init (and to genome_analyzer when
                     falling back to a full analysis)
    server -- passed through to processing_init
    options -- optional object; if its `chromosome` attribute is truthy,
               only that chromosome is expected by the progress tracker

    Returns None. If processing_init returns nothing, the function returns
    at once. If metadata.json cannot be opened or does not name build 'b36'
    or 'b37', the log handle is unlocked and closed and
    genome_analyzer(genotype_file) is run instead. Otherwise the
    GET-Evidence outputs are written to '.tmp' files and renamed into place.
    """
    init_stuff = processing_init(genotype_file, server)
    if init_stuff:
        output_dir, log, log_handle, lockfile, logfile = init_stuff
    else:
        return None
    log.put('#status 0 Reprocessing data against GET-Evidence')
    args = {
        'metadata': os.path.join(output_dir, 'metadata.json'),
        'nonsyn_data': os.path.join(output_dir, 'ns.gff'),
        'getev_out': os.path.join(output_dir, 'get-evidence.json'),
        'getev_genes_out': os.path.join(output_dir, 'get-ev_genes.json'),
        'getev_flat': os.path.join(os.getenv('DATA'), GETEV_FLAT)
    }
    # Read metadata file (need this to get build info for transcripts file)
    try:
        f_metadata = autozip.file_open(args['metadata'])
        try:
            metadata = json.loads(next(f_metadata))
        finally:
            # Do not leak the handle when the first line is not valid JSON.
            f_metadata.close()
        if metadata['genome_build'] == 'b36':
            args['transcripts'] = os.path.join(os.getenv('DATA'),
                                               KNOWNGENE_HG18_SORTED)
        elif metadata['genome_build'] == 'b37':
            args['transcripts'] = os.path.join(os.getenv('DATA'),
                                               KNOWNGENE_HG19_SORTED)
        else:
            raise KeyError
    except (IOError, KeyError):
        # No usable metadata: release the log and run the full analysis.
        fcntl.flock(log_handle, fcntl.LOCK_UN)
        log_handle.close()
        genome_analyzer(genotype_file)
        return

    if os.path.exists(args['nonsyn_data'] + '.gz'):
        args['nonsyn_data'] = args['nonsyn_data'] + '.gz'

    if options and options.chromosome:
        chrlist = [options.chromosome]
    else:
        # Autosomes 1-22 plus X and Y (range(1, 22) stopped at chr21).
        chrlist = ['chr' + str(x) for x in list(range(1, 23)) + ['X', 'Y']]
    progtrack = ProgressTracker(log_handle, [1, 99], expected=chrlist)

    # Get GET-Evidence hits
    gff_getevidence_map.match_getev_to_file(
        args['nonsyn_data'],
        args['getev_flat'],
        transcripts_file=args['transcripts'],
        output_file=args['getev_out'] + ".tmp",
        gene_out_file=args['getev_genes_out'] + ".tmp",
        progresstracker=progtrack)
    # Rename in-process rather than building a shell "mv" command from
    # paths; a failed rename now raises instead of passing unnoticed.
    os.rename(args['getev_out'] + ".tmp", args['getev_out'])
    os.rename(args['getev_genes_out'] + ".tmp", args['getev_genes_out'])
    os.rename(lockfile, logfile)
    log_handle.close()
    print("Finished reprocessing GET-Evidence hits for " + str(genotype_file))
def process_kgwname(kgwname_file):
    """Return dict linking UCSC IDs to gene names from first column

    The file is read through autozip. For every line with at least two
    whitespace-separated fields, the second field maps to the first.
    Shorter lines are ignored rather than raising IndexError, and the file
    is closed whether or not reading succeeds.
    """
    ucsc_to_name = dict()
    f_in = autozip.file_open(kgwname_file, 'r')
    try:
        for line in f_in:
            data = line.split()
            if len(data) >= 2:
                ucsc_to_name[data[1]] = data[0]
    finally:
        f_in.close()
    return ucsc_to_name
Exemple #16
0
def convert_to_file(vcf_input, output_file):
    """Convert a VCF file and output GFF-formatted data to file

    vcf_input is handed to convert(). output_file is a str (opened for
    writing through autozip) or a writable file object, and is closed once
    all lines are written or as soon as the conversion fails.
    """
    output = output_file  # default assumes writable file object
    if isinstance(output_file, str):
        output = autozip.file_open(output_file, 'w')
    try:
        conversion = convert(vcf_input)  # set up generator
        for line in conversion:
            output.write(line + "\n")
    finally:
        output.close()
def getev_reprocess(genotype_file, server=None, options=None):
    """Redo analysis against GET-Evidence data

    Arguments:
    genotype_file -- passed to processing_init (and to genome_analyzer when
                     falling back to a full analysis)
    server -- passed through to processing_init
    options -- optional object; if its `chromosome` attribute is truthy,
               only that chromosome is expected by the progress tracker

    Returns None. Nothing is done when processing_init returns a falsy
    value. When metadata.json is unreadable or its 'genome_build' is
    neither "b36" nor "b37", the log handle is unlocked and closed and
    genome_analyzer(genotype_file) runs instead. Otherwise results are
    written to ".tmp" files that are then renamed over the final outputs.
    """
    init_stuff = processing_init(genotype_file, server)
    if init_stuff:
        output_dir, log, log_handle, lockfile, logfile = init_stuff
    else:
        return None
    log.put("#status 0 Reprocessing data against GET-Evidence")
    args = {
        "metadata": os.path.join(output_dir, "metadata.json"),
        "nonsyn_data": os.path.join(output_dir, "ns.gff"),
        "getev_out": os.path.join(output_dir, "get-evidence.json"),
        "getev_genes_out": os.path.join(output_dir, "get-ev_genes.json"),
        "getev_flat": os.path.join(os.getenv("DATA"), GETEV_FLAT),
    }
    # Read metadata file (need this to get build info for transcripts file)
    try:
        f_metadata = autozip.file_open(args["metadata"])
        try:
            metadata = json.loads(next(f_metadata))
        finally:
            # Close the handle even if the first line fails to parse.
            f_metadata.close()
        if metadata["genome_build"] == "b36":
            args["transcripts"] = os.path.join(os.getenv("DATA"), KNOWNGENE_HG18_SORTED)
        elif metadata["genome_build"] == "b37":
            args["transcripts"] = os.path.join(os.getenv("DATA"), KNOWNGENE_HG19_SORTED)
        else:
            raise KeyError
    except (IOError, KeyError):
        # Metadata missing or unusable: hand over to the full analysis.
        fcntl.flock(log_handle, fcntl.LOCK_UN)
        log_handle.close()
        genome_analyzer(genotype_file)
        return

    if os.path.exists(args["nonsyn_data"] + ".gz"):
        args["nonsyn_data"] = args["nonsyn_data"] + ".gz"

    if options and options.chromosome:
        chrlist = [options.chromosome]
    else:
        # 22 autosomes plus X and Y; the old range(1, 22) left out chr22.
        chrlist = ["chr" + str(x) for x in list(range(1, 23)) + ["X", "Y"]]
    progtrack = ProgressTracker(log_handle, [1, 99], expected=chrlist)

    # Get GET-Evidence hits
    gff_getevidence_map.match_getev_to_file(
        args["nonsyn_data"],
        args["getev_flat"],
        transcripts_file=args["transcripts"],
        output_file=args["getev_out"] + ".tmp",
        gene_out_file=args["getev_genes_out"] + ".tmp",
        progresstracker=progtrack,
    )
    # os.rename avoids passing unquoted paths through a shell and raises
    # if the move fails.
    os.rename(args["getev_out"] + ".tmp", args["getev_out"])
    os.rename(args["getev_genes_out"] + ".tmp", args["getev_genes_out"])
    os.rename(lockfile, logfile)
    log_handle.close()
    print("Finished reprocessing GET-Evidence hits for " + str(genotype_file))
def getev_reprocess(genotype_file, server=None, options=None):
    """Redo analysis against GET-Evidence data

    Arguments:
    genotype_file -- passed to processing_init (and to genome_analyzer when
                     falling back to a full analysis)
    server -- passed through to processing_init
    options -- optional object; if its `chromosome` attribute is truthy,
               only that chromosome is expected by the progress tracker

    Returns None. Returns early if processing_init yields nothing. Falls
    back to genome_analyzer(genotype_file), after unlocking and closing the
    log handle, when metadata.json cannot be opened or does not report
    build 'b36' or 'b37'. Otherwise writes the GET-Evidence outputs to
    '.tmp' files, renames them into place and renames lockfile to logfile.
    """
    init_stuff = processing_init(genotype_file, server)
    if not init_stuff:
        return None
    output_dir, log, log_handle, lockfile, logfile = init_stuff
    log.put('#status 0 Reprocessing data against GET-Evidence')
    args = { 'metadata': os.path.join(output_dir, 'metadata.json'),
             'nonsyn_data': os.path.join(output_dir, 'ns.gff'),
             'getev_out': os.path.join(output_dir, 'get-evidence.json'),
             'getev_genes_out': os.path.join(output_dir, 'get-ev_genes.json'),
             'getev_flat': os.path.join(os.getenv('DATA'), GETEV_FLAT)
             }
    # Read metadata file (need this to get build info for transcripts file)
    try:
        f_metadata = autozip.file_open(args['metadata'])
        try:
            metadata = json.loads(next(f_metadata))
        finally:
            # The handle used to stay open when parsing raised.
            f_metadata.close()
        if metadata['genome_build'] == 'b36':
            args['transcripts'] = os.path.join(os.getenv('DATA'),
                                               KNOWNGENE_HG18_SORTED)
        elif metadata['genome_build'] == 'b37':
            args['transcripts'] = os.path.join(os.getenv('DATA'),
                                               KNOWNGENE_HG19_SORTED)
        else:
            raise KeyError
    except (IOError, KeyError):
        # Without build information, rerun the whole analysis instead.
        fcntl.flock(log_handle, fcntl.LOCK_UN)
        log_handle.close()
        genome_analyzer(genotype_file)
        return

    if os.path.exists(args['nonsyn_data'] + '.gz'):
        args['nonsyn_data'] = args['nonsyn_data'] + '.gz'

    if options and options.chromosome:
        chrlist = [options.chromosome]
    else:
        # chr1..chr22, chrX, chrY. range(1, 22) ended at 21 and missed chr22.
        chrlist = ['chr' + str(x) for x in list(range(1, 23)) + ['X', 'Y']]
    progtrack = ProgressTracker(log_handle, [1, 99], expected=chrlist)

    # Get GET-Evidence hits
    gff_getevidence_map.match_getev_to_file(args['nonsyn_data'],
                                            args['getev_flat'],
                                            transcripts_file=args['transcripts'],
                                            output_file=args['getev_out'] + ".tmp",
                                            gene_out_file=args['getev_genes_out'] + ".tmp",
                                            progresstracker=progtrack)
    # Move the finished files into place without going through a shell;
    # unlike os.system("mv ..."), a failure here is reported.
    os.rename(args['getev_out'] + ".tmp", args['getev_out'])
    os.rename(args['getev_genes_out'] + ".tmp", args['getev_genes_out'])
    os.rename(lockfile, logfile)
    log_handle.close()
    print("Finished reprocessing GET-Evidence hits for " + str(genotype_file))
def convert(genotype_input):
    """Take in Ancestry genotype data, yield GFF formatted lines

    genotype_input is a str (opened through autozip) or an iterable of
    lines. Leading '#' lines are searched for the reference build; a
    "##genome-build" line is yielded when they end. Each later line with at
    least five whitespace-separated fields and two allele fields starting
    with A, C, G or T becomes one tab-joined GFF line.
    """
    if isinstance(genotype_input, str):
        rows = autozip.file_open(genotype_input, 'r')
    else:
        rows = genotype_input

    # Checked in this order; the first pattern found decides the build.
    build_patterns = (
        ("reference build 37", "b37"),
        ("reference build 38", "b38"),
        ("reference build 36", "b36"),
    )
    # Chromosome codes that do not map to 'chr' + code.
    renamed_chromosomes = {"MT": 'chrM', "25": 'chrM',
                           "23": 'chrX', "24": 'chrY'}

    genome_build = DEFAULT_BUILD
    in_header = True
    for row in rows:
        # Handle the header, get the genome build if you can.
        if in_header:
            if re.match("#", row):
                for pattern, name in build_patterns:
                    if re.search(pattern, row):
                        genome_build = name
                        break
                continue
            yield "##genome-build " + genome_build
            in_header = False

        fields = row.rstrip('\n').split()
        if len(fields) < 5:
            continue
        identifier, chrom_code, position, allele_a, allele_b = fields[:5]

        if chrom_code in renamed_chromosomes:
            chromosome = renamed_chromosomes[chrom_code]
        else:
            chromosome = 'chr' + chrom_code

        # Ignore uncalled or indel positions.
        if not re.match(r'[ACGT]', allele_a):
            continue
        if not re.match(r'[ACGT]', allele_b):
            continue

        attributes = 'alleles ' + allele_a
        if allele_a != allele_b:
            attributes = attributes + '/' + allele_b
        if re.match('rs', identifier):
            attributes = attributes + '; db_xref dbsnp:' + identifier

        yield "\t".join([chromosome, "CGI", "SNP", position, position,
                         '.', '+', '.', attributes])
Exemple #20
0
    def __init__(self, f_child, f_parA, f_parB, mend_errs):
        """Initializes class variables, opens input files.

        f_child and f_parA are stored under keys 0 and 1; f_parB, when it
        is not None, under key 2. Each file name is opened through autozip
        and wrapped with gff.input, and the result is stored in self.gffs
        under the same key. mend_errs is stored unchanged.
        """
        self.filenames = {0: f_child, 1: f_parA}
        self.mend_errs = mend_errs
        self.gffs = {0: None, 1: None}
        # Positions are a tuple of chromosome, start, end, and gff record
        self.positions = {0: ('chr1', -1, -1, None), 1: ('chr1', -1, -1, None)}
        # Identity test: the second parent is optional.
        if f_parB is not None:
            self.filenames[2] = f_parB
            self.gffs[2] = None
            self.positions[2] = ('chr1', -1, -1, None)

        # Set up input/output files
        for idx, filename in self.filenames.items():
            self.gffs[idx] = gff.input(autozip.file_open(filename, 'r'))
 def __init__(self, f_child, f_parA, f_parB, mend_errs):
     """Initializes class variables, opens input files.

     Keys 0 and 1 refer to f_child and f_parA; key 2 is added only when
     f_parB is not None. For every key, the file is opened through autozip,
     wrapped with gff.input and stored in self.gffs. mend_errs is kept
     as given.
     """
     self.filenames = {0 : f_child, 1 : f_parA}
     self.mend_errs = mend_errs
     self.gffs = {0 : None, 1 : None}
     # Positions are a tuple of chromosome, start, end, and gff record
     self.positions = {0 : ('chr1', -1, -1, None),
                       1 : ('chr1', -1, -1, None)}
     # Compare against None by identity, not equality.
     if f_parB is not None:
         self.filenames[2] = f_parB
         self.gffs[2] = None
         self.positions[2] = ('chr1', -1, -1, None)

     # Set up input/output files
     for idx, filename in self.filenames.items():
         self.gffs[idx] = gff.input(autozip.file_open(filename, 'r'))
def convert(genotype_input):
    """Take in deCODEme genotype data, yield GFF formatted lines

    genotype_input is a str (opened through autozip, asking for
    'deCODEme_scan.csv') or an iterable of CSV lines. The first CSV row
    names the columns; 'YourCode', 'Chromosome', 'Strand', 'Position' and
    'Name' are used.

    Yields "##genome-build b36" first, then one tab-joined GFF line per
    called genotype. Blank rows, empty genotypes and genotypes starting
    with '-' are skipped. A one-character genotype is reported as a single
    allele. If there is no header row, only the build line is yielded.
    """
    if isinstance(genotype_input, str):
        genotype_data = csv.reader(
            autozip.file_open(genotype_input, 'r', 'deCODEme_scan.csv'))
    else:
        genotype_data = csv.reader(genotype_input)

    # We are allowing people to donate only the 'deCODEme_scan.csv' file,
    # which unfortunately lacks build information (it is stored separately
    # in 'deCODEme_info.txt', but this file also contains the deCODEme
    # username). So far deCODEme files have only been build 36, and so
    # this is the current assumption for data processing.
    build = "b36"
    yield "##genome-build " + build

    # next() with a default ends the generator cleanly on empty input.
    header_row = next(genotype_data, None)
    if header_row is None:
        return
    col = dict((name, i) for i, name in enumerate(header_row))

    for row in genotype_data:
        if not row:
            # csv.reader returns [] for blank lines.
            continue
        variants = list(row[col['YourCode']])
        if not variants or variants[0] == '-':
            continue
        chromosome = 'chr' + row[col['Chromosome']]
        strand = row[col['Strand']]
        if strand == '-':
            # Report alleles on the '+' strand, as the output line states.
            variants = [revcomp(x) for x in variants]
        pos_start = row[col['Position']]
        pos_end = pos_start

        if len(variants) == 1 or variants[0] == variants[1]:
            attributes = 'alleles ' + variants[0]
        else:
            attributes = 'alleles ' + variants[0] + '/' + variants[1]
        if re.match('rs', row[col['Name']]):
            attributes = attributes + '; db_xref dbsnp:' + row[col['Name']]

        output = [
            chromosome, "deCODEme", "SNP", pos_start, pos_end, '.', '+', '.',
            attributes
        ]
        yield "\t".join(output)
def convert(genotype_input):
    """Take in deCODEme genotype data, yield GFF formatted lines

    Accepts a str (opened through autozip, asking for 'deCODEme_scan.csv')
    or an iterable of CSV lines whose first row is the column header. The
    columns 'YourCode', 'Chromosome', 'Strand', 'Position' and 'Name' are
    read by name.

    The first yielded line is "##genome-build b36". Each following line is
    a tab-joined GFF record for one genotype. Rows that are blank, have an
    empty genotype or a genotype starting with '-' produce nothing. A
    genotype of a single character yields one allele. Input without a
    header row yields only the build line.
    """
    if isinstance(genotype_input, str):
        genotype_data = csv.reader(
            autozip.file_open(genotype_input, 'r', 'deCODEme_scan.csv'))
    else:
        genotype_data = csv.reader(genotype_input)

    # We are allowing people to donate only the 'deCODEme_scan.csv' file,
    # which unfortunately lacks build information (it is stored separately
    # in 'deCODEme_info.txt', but this file also contains the deCODEme
    # username). So far deCODEme files have only been build 36, and so
    # this is the current assumption for data processing.
    build = "b36"
    yield "##genome-build " + build

    # Column name -> index, taken from the header row.
    col = dict()
    for index, column_name in enumerate(next(genotype_data, [])):
        col[column_name] = index
    if not col:
        # No header row at all: nothing further can be converted.
        return

    for row in genotype_data:
        if len(row) == 0:
            # Blank line in the CSV.
            continue
        variants = list(row[col['YourCode']])
        if len(variants) == 0 or variants[0] == '-':
            continue
        chromosome = 'chr' + row[col['Chromosome']]
        strand = row[col['Strand']]
        if strand == '-':
            variants = [revcomp(x) for x in variants]
        pos_start = row[col['Position']]
        pos_end = pos_start

        attributes = 'alleles ' + variants[0]
        # A second, different allele is appended after a slash.
        if len(variants) > 1 and variants[0] != variants[1]:
            attributes = attributes + '/' + variants[1]
        if re.match('rs', row[col['Name']]):
            attributes = attributes + '; db_xref dbsnp:' + row[col['Name']]

        output = [
            chromosome, "deCODEme", "SNP", pos_start, pos_end, '.', '+', '.',
            attributes
        ]
        yield "\t".join(output)
def convert(genotype_input):
    """Take in 23andme genotype data, yield GFF formatted lines

    genotype_input is a str (opened through autozip) or an iterable of
    lines. Leading '#' lines are searched for "human assembly build 36/37";
    a "##genome-build" line is yielded once they end. Each later line is
    split on whitespace into ID, chromosome, position and genotype. Lines
    with fewer than four fields, or whose genotype does not start with
    A, C, G or T, are skipped.
    """
    genotype_data = genotype_input
    if isinstance(genotype_input, str):
        genotype_data = autozip.file_open(genotype_input, 'r')
    build = DEFAULT_BUILD
    header_done = False
    for line in genotype_data:
        # Handle the header, get the genome build if you can.
        if not header_done:
            if re.match("#", line):
                if re.search("human assembly build 37", line):
                    build = "b37"
                elif re.search("human assembly build 36", line):
                    build = "b36"
                continue
            else:
                yield "##genome-build " + build
                header_done = True
        data = line.rstrip('\n').split()
        # data[3] is read below, so four fields are needed (the old "< 3"
        # check let three-field lines through to an IndexError).
        if len(data) < 4:
            continue
        if data[1] == "MT":
            chromosome = 'chrM'
        else:
            chromosome = 'chr' + data[1]
        pos_start = data[2]
        pos_end = data[2]
        # Ignore uncalled or indel positions.
        if not (re.match(r'[ACGT]{1,2}', data[3])):
            continue
        if len(data[3]) > 1:
            if data[3][0] == data[3][1]:
                attributes = 'alleles ' + data[3][0]
            else:
                attributes = 'alleles ' + data[3][0] + '/' + data[3][1]
        else:
            attributes = 'alleles ' + data[3]
        if re.match('rs', data[0]):
            attributes = attributes + '; db_xref dbsnp:' + data[0]
        output = [chromosome, "CGI", "SNP", pos_start, pos_end, '.', '+',
                  '.', attributes]
        yield "\t".join(output)
Exemple #25
0
def convert(input_file, options=None):
    """Yield GFF lines for input_file, whatever supported format it is in.

    The format is taken from detect_format.detect_format(input_file). GFF
    input is opened through autozip and passed on unchanged; the other
    formats go through their converter module. options is forwarded only
    to the CGIVAR and VCF converters. Raises Exception for any other
    format (on first iteration, as this is a generator).
    """
    detected = detect_format.detect_format(input_file)

    # One zero-argument opener per supported format; only the selected one
    # is ever called.
    openers = {
        'GFF': lambda: autozip.file_open(input_file),
        'CGIVAR': lambda: cgivar_to_gff.convert(input_file, options),
        '23ANDME': lambda: gff_from_23andme.convert(input_file),
        'VCF': lambda: vcf_to_gff.convert(input_file, options),
        'deCODEme': lambda: gff_from_decodeme.convert(input_file),
        'FTDNA': lambda: gff_from_ftdna.convert(input_file),
    }
    if detected not in openers:
        raise Exception("input format not recognized")

    for gff_line in openers[detected]():
        yield gff_line
def convert(input_file, options=None):
    """Generator: detect the format of input_file and yield it as GFF lines.

    detect_format.detect_format() decides which converter is used. GFF
    files are streamed through autozip.file_open(); CGIVAR and VCF
    converters also receive options, the remaining converters take only
    the input file.

    Raises Exception("input format not recognized") when the detected
    format has no converter here.
    """
    def open_as_gff(kind):
        # Return an iterable of GFF lines for the detected format.
        if kind == 'GFF':
            return autozip.file_open(input_file)
        if kind == 'CGIVAR':
            return cgivar_to_gff.convert(input_file, options)
        if kind == '23ANDME':
            return gff_from_23andme.convert(input_file)
        if kind == 'VCF':
            return vcf_to_gff.convert(input_file, options)
        if kind == 'deCODEme':
            return gff_from_decodeme.convert(input_file)
        if kind == 'FTDNA':
            return gff_from_ftdna.convert(input_file)
        raise Exception("input format not recognized")

    for record in open_as_gff(detect_format.detect_format(input_file)):
        yield record
Exemple #27
0
def convert(genotype_input):
    """Take in Family Tree genotype data, yield GFF formatted lines

    genotype_input is either a path (str), which is opened through
    autozip, or an iterable of CSV text lines. The first CSV row must be
    a header naming the RSID, CHROMOSOME, POSITION and RESULT columns.

    The first item yielded is always the "##genome-build b36" header.
    Each later item is one tab-separated GFF line for a called SNP.
    Blank rows, rows with an empty RESULT, no-calls ('-'), insertions
    ('I') and deletions ('D') are skipped. A single-character RESULT is
    reported as a single allele.
    """
    if isinstance(genotype_input, str):
        genotype_data = csv.reader(autozip.file_open(genotype_input, 'r'))
    else:
        genotype_data = csv.reader(genotype_input)

    # Currently Family Tree DNA appears to only be in build 36 format.
    # There doesn't appear to be any record in the files regarding which
    # build was used.
    build = "b36"
    yield "##genome-build " + build

    # The builtin next() works on Python 2.6+ and 3 (the .next() method is
    # Python 2 only); the default avoids raising out of the generator when
    # the input has no rows at all.
    header_row = next(genotype_data, None)
    if header_row is None:
        return
    # Column name -> index; a repeated name keeps its last position.
    col = dict((name, i) for i, name in enumerate(header_row))

    for row in genotype_data:
        if not row:
            # csv.reader yields [] for blank lines (e.g. a trailing newline).
            continue
        variants = row[col['RESULT']]
        if not variants or variants[0] in ('-', 'I', 'D'):
            continue
        chromosome = 'chr' + row[col['CHROMOSOME']]
        pos_start = row[col['POSITION']]
        pos_end = pos_start

        if len(variants) == 1 or variants[0] == variants[1]:
            attributes = 'alleles ' + variants[0]
        else:
            attributes = 'alleles ' + variants[0] + '/' + variants[1]
        rsid = row[col['RSID']]
        if rsid.startswith('rs'):
            attributes = attributes + '; db_xref dbsnp:' + rsid

        output = [
            chromosome, "FTDNA", "SNP", pos_start, pos_end, '.', '+', '.',
            attributes
        ]
        yield "\t".join(output)
def convert(genotype_input):
    """Take in Family Tree genotype data, yield GFF formatted lines

    genotype_input may be a path (str), opened through autozip, or any
    iterable of CSV text lines. The first CSV row is a header that must
    name the RSID, CHROMOSOME, POSITION and RESULT columns.

    Yields the "##genome-build b36" header first, then one tab-separated
    GFF line per called SNP. Rows that are blank, have an empty RESULT,
    or whose RESULT starts with '-', 'I' or 'D' are skipped; a RESULT of
    one character is emitted as a single allele.
    """
    if isinstance(genotype_input, str):
        genotype_data = csv.reader(autozip.file_open(genotype_input, 'r'))
    else:
        genotype_data = csv.reader(genotype_input)

    # Currently Family Tree DNA appears to only be in build 36 format.
    # There doesn't appear to be any record in the files regarding which
    # build was used.
    build = "b36"
    yield "##genome-build " + build

    # Builtin next() is portable across Python 2.6+ and 3; with a default
    # it also lets an empty input end the generator cleanly.
    header_row = next(genotype_data, None)
    if header_row is None:
        return
    col = dict()
    for i, column_name in enumerate(header_row):
        col[column_name] = i

    skipped_calls = ('-', 'I', 'D')
    for row in genotype_data:
        # Blank lines come back from csv.reader as an empty list.
        if not row:
            continue
        variants = row[col['RESULT']]
        if not variants or variants[0] in skipped_calls:
            continue
        chromosome = 'chr' + row[col['CHROMOSOME']]
        pos_start = row[col['POSITION']]
        pos_end = pos_start

        # One character, or two identical ones, is reported as one allele.
        if len(variants) < 2 or variants[0] == variants[1]:
            attributes = 'alleles ' + variants[0]
        else:
            attributes = 'alleles ' + variants[0] + '/' + variants[1]
        rsid = row[col['RSID']]
        if rsid.startswith('rs'):
            attributes = attributes + '; db_xref dbsnp:' + rsid

        output = [chromosome, "FTDNA", "SNP", pos_start, pos_end, '.', '+',
                  '.', attributes]
        yield "\t".join(output)
def convert(vcf_input, options=None):
    """Generator that converts VCF data to GFF-formatted strings

    vcf_input is either a path (str), opened through autozip, or an
    iterable of VCF text lines. Lines starting with '#' before the first
    data line are passed to process_header() to work out the genome
    build. When the first non-header line is reached, a
    "##genome-build <build>" line is yielded; after that, each data line
    is passed to process_line() and any truthy result is yielded.

    If options.chromosome is set, only lines for that chromosome are
    converted (the VCF name may omit the 'chr' prefix), and reading stops
    at the first line for another chromosome seen after it.
    """
    # Set up VCF input. Default is to assume a str generator.
    vcf_data = vcf_input
    opened_here = isinstance(vcf_input, str)
    if opened_here:
        vcf_data = autozip.file_open(vcf_input, 'r')

    build = DEFAULT_BUILD
    header_done = False
    saw_chromosome = False
    try:
        for line in vcf_data:
            # Handle the header, get the genome build if you can.
            if not header_done:
                if line.startswith("#"):
                    build = process_header(line, build)
                    continue
                # Output GFF header once we're done reading VCF header.
                yield "##genome-build " + build
                header_done = True
            # Skip lines with no word characters (raw string so that \W
            # reaches the regex engine rather than being a string escape).
            if re.search(r"^\W*$", line):
                continue

            if options and options.chromosome:
                data = line.rstrip('\n').split("\t")
                if (data[0] != options.chromosome and
                        'chr' + data[0] != options.chromosome):
                    if saw_chromosome:
                        # Assume all base calls for a single chromosome
                        # are in a contiguous block.
                        break
                    continue
                saw_chromosome = True

            output = process_line(line)

            if output:
                yield output
    finally:
        # Only close what this function opened; caller-supplied iterables
        # stay under the caller's control.
        if opened_here:
            vcf_data.close()
Exemple #30
0
def process_kgxref(kgxref_file, ucsc_to_name):
    """Find and return one-to-one Uniprot ID / gene name mapping

    Using kgXref and our own gene name mappings, some gene names appear to
    correspond to more than one Uniprot ID (100 in hg18) and some Uniprot IDs
    appear to correspond to more than one gene name (51 in hg18). Because these
    are such a small fraction, we remove them and keep only the one-to-one
    pairs.

    kgxref_file is opened through autozip and read as tab-separated text:
    column 0 is looked up in ucsc_to_name to get the gene name, column 2
    is the Uniprot ID (anything from the first '-' onward is dropped).
    Lines with fewer than three columns, an empty column 2, or a column 0
    that is not in ucsc_to_name are ignored.

    Returns a dict keyed by Uniprot ID with the gene name as value.
    NOTE(review): earlier documentation said both IDs and names were keys
    of the result, but this function only adds the Uniprot -> gene
    direction; confirm what callers expect.
    """
    name_to_uniprot = dict()
    uniprot_to_name = dict()
    name_unique = dict()
    uniprot_unique = dict()
    f_in = autozip.file_open(kgxref_file, 'r')
    try:
        for line in f_in:
            data = line.rstrip('\n').split('\t')
            # Blank or truncated lines have no Uniprot column to read.
            if len(data) < 3:
                continue
            if not (data[2] and data[0] in ucsc_to_name):
                continue
            genename = ucsc_to_name[data[0]]
            # Keep only the text before the first '-' in the Uniprot ID.
            uniprotname = data[2].split('-', 1)[0]
            if genename in name_to_uniprot:
                if name_to_uniprot[genename] != uniprotname:
                    name_unique[genename] = False
            else:
                name_to_uniprot[genename] = uniprotname
                name_unique[genename] = True
            if uniprotname in uniprot_to_name:
                if uniprot_to_name[uniprotname] != genename:
                    uniprot_unique[uniprotname] = False
            else:
                uniprot_to_name[uniprotname] = genename
                uniprot_unique[uniprotname] = True
    finally:
        f_in.close()
    final_dict = dict()
    for key in uniprot_to_name:
        if uniprot_unique[key] and name_unique[uniprot_to_name[key]]:
            final_dict[key] = uniprot_to_name[key]
    return final_dict
def process_kgxref(kgxref_file, ucsc_to_name):
    """Find and return one-to-one Uniprot ID / gene name mapping

    Using kgXref and our own gene name mappings, some gene names appear to
    correspond to more than one Uniprot ID (100 in hg18) and some Uniprot IDs
    appear to correspond to more than one gene name (51 in hg18). Because these
    are such a small fraction, we remove them and keep only the one-to-one
    pairs.

    The file is opened through autozip and read as tab-separated text.
    Column 0 is translated to a gene name via ucsc_to_name and column 2
    supplies the Uniprot ID, truncated at its first '-'. Lines that are
    too short, have an empty column 2, or have an unknown column 0 are
    skipped.

    Returns a dict mapping Uniprot ID to gene name.
    NOTE(review): the previous docstring described a dict with both IDs
    and names as keys; only the Uniprot -> gene direction is added here.
    Confirm against callers.
    """
    name_to_uniprot = dict()
    uniprot_to_name = dict()
    name_unique = dict()
    uniprot_unique = dict()
    f_in = autozip.file_open(kgxref_file, 'r')
    try:
        for line in f_in:
            data = line.rstrip('\n').split('\t')
            # A blank or truncated line has no third column.
            if len(data) < 3 or not data[2] or data[0] not in ucsc_to_name:
                continue
            genename = ucsc_to_name[data[0]]
            # Text before the first '-' (the whole ID when there is none).
            uniprotname = data[2].split('-', 1)[0]

            # A second, different partner marks the entry as ambiguous.
            if genename not in name_to_uniprot:
                name_to_uniprot[genename] = uniprotname
                name_unique[genename] = True
            elif name_to_uniprot[genename] != uniprotname:
                name_unique[genename] = False

            if uniprotname not in uniprot_to_name:
                uniprot_to_name[uniprotname] = genename
                uniprot_unique[uniprotname] = True
            elif uniprot_to_name[uniprotname] != genename:
                uniprot_unique[uniprotname] = False
    finally:
        f_in.close()

    final_dict = dict()
    for key in uniprot_to_name:
        if uniprot_unique[key] and name_unique[uniprot_to_name[key]]:
            final_dict[key] = uniprot_to_name[key]
    return final_dict
Exemple #32
0
def convert(vcf_input, options=None):
    """Generator that converts VCF data to GFF-formatted strings

    vcf_input may be a path (str), opened through autozip, or an iterable
    of VCF text lines. Leading lines that start with '#' go to
    process_header(), which may update the genome build. At the first
    non-header line a "##genome-build <build>" line is yielded; every
    later data line goes to process_line() and truthy results are yielded.

    When options.chromosome is set, lines for other chromosomes are
    skipped (the VCF name is also tried with a 'chr' prefix) and reading
    stops once the requested chromosome's block has ended.
    """
    # Set up VCF input. Default is to assume a str generator.
    vcf_data = vcf_input
    owns_handle = isinstance(vcf_input, str)
    if owns_handle:
        vcf_data = autozip.file_open(vcf_input, 'r')

    build = DEFAULT_BUILD
    header_done = False
    saw_chromosome = False
    try:
        for line in vcf_data:
            # Handle the header, get the genome build if you can.
            if not header_done:
                if line.startswith("#"):
                    build = process_header(line, build)
                    continue
                # Output GFF header once we're done reading VCF header.
                yield "##genome-build " + build
                header_done = True
            # Raw string: \W must reach the regex engine untouched.
            if re.search(r"^\W*$", line):
                continue

            if options and options.chromosome:
                data = line.rstrip('\n').split("\t")
                if (data[0] != options.chromosome
                        and 'chr' + data[0] != options.chromosome):
                    if saw_chromosome:
                        # Assume all base calls for a single chromosome
                        # are in a contiguous block.
                        break
                    continue
                saw_chromosome = True

            output = process_line(line)

            if output:
                yield output
    finally:
        # Close the file only if it was opened here.
        if owns_handle:
            vcf_data.close()
def load_getev(getev_file):
    """Build a lookup of GET-Evidence variant IDs from a flat file.

    getev_file is a path (str), opened through autozip, or an already
    open iterable of lines; each line is parsed as one JSON record. The
    input is closed before returning, whichever form was given.

    Returns a dict whose keys are "<gene>-<aa_change_short>" for records
    that have both fields, or the dbsnp_id otherwise, and whose values
    are the record's variant_id. Records without a truthy variant_id, or
    with neither kind of key, are left out.
    """
    def filled(record, field):
        # Field is present and holds a truthy value.
        return field in record and record[field]

    if isinstance(getev_file, str):
        source = autozip.file_open(getev_file)
    else:
        source = getev_file

    lookup = dict()
    for raw in source:
        record = json.loads(raw)
        with_id = filled(record, 'variant_id')
        with_aa = filled(record, 'gene') and filled(record, 'aa_change_short')
        with_rs = filled(record, 'dbsnp_id')
        if with_aa and with_id:
            lookup[record['gene'] + '-' + record['aa_change_short']] = \
                record['variant_id']
        elif with_rs and with_id:
            lookup[record['dbsnp_id']] = record['variant_id']
    source.close()
    return lookup
def load_getev(getev_file):
    """Read a GET-Evidence flat file into a key -> variant_id dict.

    Accepts a path (str), opened through autozip, or an open iterable of
    lines, one JSON record per line. The input is closed when reading is
    done.

    A record with a truthy variant_id is stored under
    "<gene>-<aa_change_short>" when both of those fields are truthy, and
    otherwise under its dbsnp_id when that is truthy. Other records
    contribute nothing.
    """
    lines = (autozip.file_open(getev_file)
             if isinstance(getev_file, str) else getev_file)
    variant_ids = {}
    for text in lines:
        entry = json.loads(text)
        # All three checks are evaluated for every record, in this order.
        id_ok = 'variant_id' in entry and entry['variant_id']
        aa_ok = ('gene' in entry and entry['gene']
                 and 'aa_change_short' in entry
                 and entry['aa_change_short'])
        rs_ok = 'dbsnp_id' in entry and entry['dbsnp_id']
        if aa_ok and id_ok:
            key = entry['gene'] + '-' + entry['aa_change_short']
        elif rs_ok and id_ok:
            key = entry['dbsnp_id']
        else:
            continue
        variant_ids[key] = entry['variant_id']
    lines.close()
    return variant_ids
 def read_metadata(self, genome_id):
     """Open file containing metadata, initializes self.metadata

     The path is GENOMEFILE_PRE + genome_id + GENOMEMETA_POST. Only the
     first line of the file is read; it is parsed as JSON and stored in
     self.metadata. The file is closed even if reading or parsing fails.
     """
     metadata_path = GENOMEFILE_PRE + genome_id + GENOMEMETA_POST
     f_meta = autozip.file_open(metadata_path)
     try:
         # Builtin next() is portable across Python 2.6+ and 3.
         self.metadata = json.loads(next(f_meta))
     finally:
         f_meta.close()
def _read_coverage_fields(coverage_in):
    """Return the next coverage line split on whitespace, or None at EOF."""
    try:
        return next(coverage_in).split()
    except StopIteration:
        return None


def _gff_interval_row(gff_data, end, cov_blank):
    """Build the output row for an interval present only in the GFF.

    The GFF start (field 3) is shifted down by one, as it is everywhere
    it is compared with a coverage start; *end* is written as given.
    """
    return [gff_data[0], str(int(gff_data[3]) - 1), end] + cov_blank + ['1']


def add_coverage(shasum, coveragefile):
    """Merge one genome's GFF intervals into a coverage table as a new column.

    Nothing is done unless read_metadata(shasum) reports a genome_build
    of 'b36'; in every other case coveragefile is returned unchanged.

    Otherwise the coverage file and
    /home/trait/upload/<shasum>-out/ns.gff.gz are read side by side and a
    new file is written next to coveragefile, named after it (minus any
    '.gz') plus '_' + the first six characters of shasum + '.gz'. The
    header row gains shasum as a new last column. Each data row is
    "chromosome start end <existing columns...> <flag>", where flag is
    '1' for stretches covered by the GFF and '0' otherwise; stretches
    present only in the GFF get '0' in every existing column. Intervals
    are split wherever the two inputs only partly overlap.

    The merge relies on both inputs being ordered by chromosome (compared
    as strings) and then by position. move_gff_ahead() is defined
    elsewhere; it presumably returns the next GFF record to process and
    a new lookahead, with a falsy record at the end of the file.

    Returns the path of the file written.
    """
    metadata = read_metadata(shasum)
    if (not metadata or 'genome_build' not in metadata
            or metadata['genome_build'] != 'b36'):
        return coveragefile

    coverage_in = autozip.file_open(coveragefile)
    gff_in = autozip.file_open('/home/trait/upload/' + shasum +
                               '-out/ns.gff.gz')
    covdir, covfile = os.path.split(coveragefile)
    covfile_pre = covfile
    gz_match = re.match(r'(.*)\.gz', covfile)
    if gz_match:
        covfile_pre = gz_match.group(1)
    coverage_out_path = os.path.join(covdir,
                                     covfile_pre + '_' + shasum[0:6] + '.gz')
    coverage_out = autozip.file_open(coverage_out_path, 'w')

    def emit(fields):
        # Write one space-separated output row.
        coverage_out.write(' '.join(fields) + '\n')

    cov_header = next(coverage_in).rstrip().split()
    emit(cov_header + [shasum])

    # Skip GFF header lines, then prime the current record and lookahead.
    gff_lookahead = next(gff_in).split()
    while gff_lookahead and re.match('#', gff_lookahead[0]):
        gff_lookahead = next(gff_in).split()
    gff_currdata, gff_lookahead = move_gff_ahead(gff_in, gff_lookahead)

    coverage_currdata = next(coverage_in).split()
    # One '0' per existing column, used for rows that come from the GFF only.
    cov_blank = ['0' for x in coverage_currdata[3:]]

    while coverage_currdata or gff_currdata:
        # Skip data that are zero or negative (??) coverage
        if (gff_currdata
                and int(gff_currdata[4]) - (int(gff_currdata[3]) - 1) <= 0):
            gff_currdata, gff_lookahead = move_gff_ahead(gff_in, gff_lookahead)
            continue
        if (coverage_currdata and
                int(coverage_currdata[2]) - int(coverage_currdata[1]) <= 0):
            coverage_currdata = _read_coverage_fields(coverage_in)
            continue
        # If coverage file is done, output GFF line
        if not coverage_currdata:
            # The start is shifted down by one like every other GFF-only
            # row; previously the raw GFF start was written here.
            emit(_gff_interval_row(gff_currdata, gff_currdata[4], cov_blank))
            gff_currdata, gff_lookahead = move_gff_ahead(gff_in, gff_lookahead)
            continue
        # If GFF file is done, output coverage file line
        if not gff_currdata:
            emit(coverage_currdata + ['0'])
            coverage_currdata = _read_coverage_fields(coverage_in)
            continue
        # If they aren't on the same chromosome, move one of them forward.
        if coverage_currdata[0] != gff_currdata[0]:
            if coverage_currdata[0] < gff_currdata[0]:
                emit(coverage_currdata + ['0'])
                coverage_currdata = _read_coverage_fields(coverage_in)
            else:
                emit(_gff_interval_row(gff_currdata, gff_currdata[4],
                                       cov_blank))
                gff_currdata, gff_lookahead = move_gff_ahead(
                    gff_in, gff_lookahead)
            continue

        # If we get here, we have both files & both are on the same chrom.
        # These fields were already parsed by the length checks above.
        cov_start = int(coverage_currdata[1])
        cov_end = int(coverage_currdata[2])
        gff_start = int(gff_currdata[3]) - 1
        gff_end = int(gff_currdata[4])

        if cov_start < gff_start:
            # Coverage file start is before GFF start.
            if cov_end <= gff_start:
                # Whole coverage file data is before GFF line.
                emit(coverage_currdata + ['0'])
                coverage_currdata = _read_coverage_fields(coverage_in)
            else:
                # Print uncovered up to the GFF start, keep the remainder.
                # The remainder is never empty because cov_end > gff_start.
                emit(coverage_currdata[0:2] + [str(gff_start)] +
                     coverage_currdata[3:] + ['0'])
                coverage_currdata[1] = str(gff_start)
        elif cov_start > gff_start:
            # GFF start is before coverage file start.
            if cov_start > gff_end:
                # Whole GFF file data is before coverage file data
                emit(_gff_interval_row(gff_currdata, gff_currdata[4],
                                       cov_blank))
                gff_currdata, gff_lookahead = move_gff_ahead(
                    gff_in, gff_lookahead)
            else:
                # Print uncovered GFF up to coverage file start
                emit(_gff_interval_row(gff_currdata, coverage_currdata[1],
                                       cov_blank))
                gff_currdata[3] = str(cov_start + 1)
        else:
            # Coverage file and GFF data have same start.
            if cov_end < gff_end:
                # Coverage file ends first: output, update GFF, advance coverage
                emit(coverage_currdata + ['1'])
                gff_currdata[3] = str(cov_end + 1)
                coverage_currdata = _read_coverage_fields(coverage_in)
            elif cov_end > gff_end:
                # GFF ends first: output, update coverage, advance GFF.
                # The remaining coverage is never empty (cov_end > gff_end).
                emit(coverage_currdata[0:2] + [gff_currdata[4]] +
                     coverage_currdata[3:] + ['1'])
                coverage_currdata[1] = gff_currdata[4]
                gff_currdata, gff_lookahead = move_gff_ahead(
                    gff_in, gff_lookahead)
            else:
                # Both end at the same point: Output and advance both.
                emit(coverage_currdata + ['1'])
                coverage_currdata = _read_coverage_fields(coverage_in)
                gff_currdata, gff_lookahead = move_gff_ahead(
                    gff_in, gff_lookahead)
    coverage_out.close()
    gff_in.close()
    coverage_in.close()
    return coverage_out_path
Exemple #37
0
def genome_analyzer(genotype_file, server=None, options=None):
    """Perform analyses on genotype_file

    The input is read through process_source(). If the metadata.json
    beside the input names both 'parent A' and 'parent B', each parent
    directory holds a file whose name starts with 'genotype', and all
    three genome builds agree, the child genome is first phased with
    gff_trio_phase.PhaseTrio. The resulting GFF stream is then pulled
    through the metadata, missing-coverage, reference-allele, dbSNP,
    nonsynonymous and GET-Evidence stages, and the results are written
    into the output directory supplied by processing_init(). Reference
    data files are located under the directory named by the DATA
    environment variable.

    Arguments:
    genotype_file -- path of the genome file to analyze
    server -- passed through to processing_init()
    options -- optional object; the attributes read here are chromosome,
               no_metadata and metadata_only

    Returns None. Returns at once if processing_init() gives a falsy
    result. Raises Exception if the genome build is neither "b36" nor
    "b37".
    """
    # Local import: only needed to move finished outputs into place.
    import subprocess

    init_stuff = processing_init(genotype_file, server)
    if init_stuff:
        output_dir, log, log_handle, lockfile, logfile = init_stuff
    else:
        return None

    data_dir = os.getenv('DATA')

    # Set up arguments used by processing commands and scripts.
    args = {
        'genotype_input': str(genotype_file),
        'miss_out': os.path.join(output_dir, 'missing_coding.json'),
        'sorted_out': os.path.join(output_dir, 'source_sorted.gff.gz'),
        'nonsyn_out_tmp': os.path.join(output_dir, 'ns_tmp.gff.gz'),
        'nonsyn_out': os.path.join(output_dir, 'ns.gff.gz'),
        'getev_out': os.path.join(output_dir, 'get-evidence.json'),
        'getev_genes_out': os.path.join(output_dir, 'get-ev_genes.json'),
        'metadata_out': os.path.join(output_dir, 'metadata.json'),
        'genome_stats': os.path.join(data_dir, 'genome_stats.txt'),
        'genetests': os.path.join(data_dir, GENETESTS_DATA),
        'getev_flat': os.path.join(data_dir, GETEV_FLAT)
    }

    # Make output directory if needed. Failure is reported but not fatal.
    try:
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
    except OSError:
        print("Unexpected error: " + str(sys.exc_info()[0]))

    # Read metadata with uploaded file, if available.
    f_metadata = None
    try:
        f_metadata = autozip.file_open(
            os.path.dirname(genotype_file) + '/metadata.json')
        genome_data = json.loads(next(f_metadata))
    except IOError:
        genome_data = dict()
    finally:
        if f_metadata is not None:
            f_metadata.close()

    # Process and sort input genome data
    log.put('#status 0/100 converting and sorting input file')
    gff_in_gen = None
    # Look for parents and, if possible, use these to phase genome.
    if ('parent A' in genome_data and 'parent B' in genome_data):
        upload_root = os.path.dirname(
            os.path.dirname(args['genotype_input']))
        parA_in_dir = os.path.join(upload_root, genome_data['parent A'])
        parB_in_dir = os.path.join(upload_root, genome_data['parent B'])
        if os.path.exists(parA_in_dir) and os.path.exists(parB_in_dir):
            parA_file_match = [
                x for x in os.listdir(parA_in_dir) if re.match('genotype', x)
            ]
            parB_file_match = [
                x for x in os.listdir(parB_in_dir) if re.match('genotype', x)
            ]
            if parA_file_match and parB_file_match:
                parA_input = os.path.join(parA_in_dir, parA_file_match[0])
                parB_input = os.path.join(parB_in_dir, parB_file_match[0])
                gff_parA_gen = process_source(parA_input,
                                              dict(),
                                              options=options)
                gff_parB_gen = process_source(parB_input,
                                              dict(),
                                              options=options)
                gff_child_gen = process_source(args['genotype_input'],
                                               genome_data,
                                               options=options)
                # The first item from each generator is its genome build.
                parA_build = next(gff_parA_gen)
                parB_build = next(gff_parB_gen)
                genome_data['genome_build'] = next(gff_child_gen)
                if (parA_build == genome_data['genome_build']
                        and parB_build == genome_data['genome_build']):
                    trio_phase = gff_trio_phase.PhaseTrio(
                        gff_child_gen, gff_parA_gen, gff_parB_gen, False)
                    gff_in_gen = trio_phase.call_phase()
    # Set up if trio phasing couldn't be done.
    if not gff_in_gen:
        # We pass build as a yield (instead of in metadata) to force the
        # generator to read through the header portion of the input data.
        gff_in_gen = process_source(args['genotype_input'],
                                    genome_data,
                                    options=options)
        genome_data['genome_build'] = next(gff_in_gen)

    # Set up build-dependent file locations
    if (genome_data['genome_build'] == "b36"):
        args['dbsnp'] = os.path.join(data_dir, DBSNP_B36_SORTED)
        args['reference'] = os.path.join(data_dir, REFERENCE_GENOME_HG18)
        args['transcripts'] = os.path.join(data_dir, KNOWNGENE_HG18_SORTED)
    elif (genome_data['genome_build'] == "b37"):
        args['dbsnp'] = os.path.join(data_dir, DBSNP_B37_SORTED)
        args['reference'] = os.path.join(data_dir, REFERENCE_GENOME_HG19)
        args['transcripts'] = os.path.join(data_dir, KNOWNGENE_HG19_SORTED)
    else:
        raise Exception("genome build data is invalid")

    if options and options.chromosome:
        chrlist = [options.chromosome]
    else:
        # It might be more elegant to extract this from metadata.
        # range(1, 23) covers autosomes 1-22; the previous upper bound of
        # 22 left chr22 out of the expected list.
        chrlist = ['chr' + str(x) for x in list(range(1, 23)) + ['X', 'Y']]

    # Process genome through a series of GFF-formatted string generators.
    log.put('#status 20 looking up reference alleles and '
            'dbSNP IDs, computing nonsynonymous changes, '
            'cross-referencing GET-Evidence database')
    progtrack = ProgressTracker(sys.stderr, [22, 99],
                                expected=chrlist,
                                metadata=genome_data)

    if not options or not options.no_metadata:

        # Record chromosomes seen and genome coverage.
        gff_in_gen = get_metadata.genome_metadata(gff_in_gen,
                                                  args['genome_stats'],
                                                  progresstracker=progtrack)

        # Report coding regions that lack coverage.
        gff_in_gen = call_missing.report_uncovered(
            gff_in_gen,
            args['transcripts'],
            args['genetests'],
            output_file=args['miss_out'],
            progresstracker=progtrack)

    if options and options.metadata_only:
        # Drain the generator chain so the metadata stages see every line.
        for line in gff_in_gen:
            pass

    else:
        # Find reference allele.
        gff_in_gen = gff_twobit_query.match2ref(gff_in_gen, args['reference'])

        # Look up dbSNP IDs
        gff_in_gen = gff_dbsnp_query.match2dbSNP(gff_in_gen, args['dbsnp'])

        # Check for nonsynonymous SNP
        gff_in_gen = gff_nonsynonymous_filter.predict_nonsynonymous(
            gff_in_gen, args['reference'], args['transcripts'])

        # Pull off GET-Evidence hits
        gff_in_gen = gff_getevidence_map.match_getev(
            gff_in_gen,
            args['getev_flat'],
            transcripts_file=args['transcripts'],
            gene_out_file=args['getev_genes_out'] + ".tmp",
            output_file=args['getev_out'] + ".tmp",
            progresstracker=progtrack)

        # Printing to output, pulls data through the generator chain.
        ns_out = autozip.file_open(args['nonsyn_out_tmp'], 'w')
        for line in gff_in_gen:
            ns_out.write(line + "\n")
        ns_out.close()

        # Move finished files into place. Passing an argument list keeps
        # the shell out of it, so paths with spaces or metacharacters are
        # safe; as before, a failing mv is not treated as fatal.
        finished = (
            (args['getev_out'] + ".tmp", args['getev_out']),
            (args['nonsyn_out_tmp'], args['nonsyn_out']),
            (args['getev_genes_out'] + ".tmp", args['getev_genes_out']),
        )
        for tmp_path, final_path in finished:
            subprocess.call(['mv', tmp_path, final_path])

    # Print metadata
    metadata_f_out = open(args['metadata_out'], 'w')
    progtrack.write_metadata(metadata_f_out)
    metadata_f_out.close()

    log.put('#status 100 finished')

    os.rename(lockfile, logfile)
    log_handle.close()
    print("Finished processing file " + str(genotype_file))
def _read_coverage_fields(coverage_in):
    """Return the next coverage line split on whitespace, or None at EOF."""
    try:
        return next(coverage_in).split()
    except StopIteration:
        return None


def _gff_interval_row(gff_data, end, cov_blank):
    """Build the output row for an interval present only in the GFF.

    The GFF start (field 3) is shifted down by one, as it is everywhere
    it is compared with a coverage start; *end* is written as given.
    """
    return [gff_data[0], str(int(gff_data[3]) - 1), end] + cov_blank + ["1"]


def add_coverage(shasum, coveragefile):
    """Add one genome's GFF coverage to a coverage table as a new column.

    If read_metadata(shasum) does not report a genome_build of "b36",
    coveragefile is returned unchanged and nothing is written.

    Otherwise the coverage file and
    /home/trait/upload/<shasum>-out/ns.gff.gz are walked together and a
    new file is written beside coveragefile: its name (minus any ".gz")
    plus "_" + the first six characters of shasum + ".gz". The header row
    gets shasum appended. Every data row is
    "chromosome start end <existing columns...> <flag>" with flag "1"
    where the GFF covers the stretch and "0" where it does not; stretches
    found only in the GFF carry "0" in each existing column. Partly
    overlapping intervals are split at the overlap boundaries.

    Both inputs must be ordered by chromosome (string comparison) and
    then position for the merge to work. move_gff_ahead() lives
    elsewhere; it presumably returns the next GFF record plus a new
    lookahead, and a falsy record once the GFF is exhausted.

    Returns the path of the file written.
    """
    metadata = read_metadata(shasum)
    if not metadata or "genome_build" not in metadata or metadata["genome_build"] != "b36":
        return coveragefile

    coverage_in = autozip.file_open(coveragefile)
    gff_in = autozip.file_open("/home/trait/upload/" + shasum + "-out/ns.gff.gz")
    covdir, covfile = os.path.split(coveragefile)
    covfile_pre = covfile
    gz_match = re.match(r"(.*)\.gz", covfile)
    if gz_match:
        covfile_pre = gz_match.group(1)
    coverage_out_path = os.path.join(covdir, covfile_pre + "_" + shasum[0:6] + ".gz")
    coverage_out = autozip.file_open(coverage_out_path, "w")

    def emit(fields):
        # Write one space-separated output row.
        coverage_out.write(" ".join(fields) + "\n")

    cov_header = next(coverage_in).rstrip().split()
    emit(cov_header + [shasum])

    # Skip GFF header lines, then prime the current record and lookahead.
    gff_lookahead = next(gff_in).split()
    while gff_lookahead and re.match("#", gff_lookahead[0]):
        gff_lookahead = next(gff_in).split()
    gff_currdata, gff_lookahead = move_gff_ahead(gff_in, gff_lookahead)

    coverage_currdata = next(coverage_in).split()
    # One "0" per existing column, used for rows that come from the GFF only.
    cov_blank = ["0" for x in coverage_currdata[3:]]

    while coverage_currdata or gff_currdata:
        # Skip data that are zero or negative (??) coverage
        if gff_currdata and int(gff_currdata[4]) - (int(gff_currdata[3]) - 1) <= 0:
            gff_currdata, gff_lookahead = move_gff_ahead(gff_in, gff_lookahead)
            continue
        if coverage_currdata and int(coverage_currdata[2]) - int(coverage_currdata[1]) <= 0:
            coverage_currdata = _read_coverage_fields(coverage_in)
            continue
        # If coverage file is done, output GFF line
        if not coverage_currdata:
            # The start is shifted down by one like every other GFF-only
            # row; previously the raw GFF start was written here.
            emit(_gff_interval_row(gff_currdata, gff_currdata[4], cov_blank))
            gff_currdata, gff_lookahead = move_gff_ahead(gff_in, gff_lookahead)
            continue
        # If GFF file is done, output coverage file line
        if not gff_currdata:
            emit(coverage_currdata + ["0"])
            coverage_currdata = _read_coverage_fields(coverage_in)
            continue
        # If they aren't on the same chromosome, move one of them forward.
        if coverage_currdata[0] != gff_currdata[0]:
            if coverage_currdata[0] < gff_currdata[0]:
                emit(coverage_currdata + ["0"])
                coverage_currdata = _read_coverage_fields(coverage_in)
            else:
                emit(_gff_interval_row(gff_currdata, gff_currdata[4], cov_blank))
                gff_currdata, gff_lookahead = move_gff_ahead(gff_in, gff_lookahead)
            continue

        # If we get here, we have both files & both are on the same chrom.
        # These fields were already parsed by the length checks above.
        cov_start = int(coverage_currdata[1])
        cov_end = int(coverage_currdata[2])
        gff_start = int(gff_currdata[3]) - 1
        gff_end = int(gff_currdata[4])

        if cov_start < gff_start:
            # Coverage file start is before GFF start.
            if cov_end <= gff_start:
                # Whole coverage file data is before GFF line.
                emit(coverage_currdata + ["0"])
                coverage_currdata = _read_coverage_fields(coverage_in)
            else:
                # Print uncovered up to the GFF start, keep the remainder.
                # The remainder is never empty because cov_end > gff_start.
                emit(coverage_currdata[0:2] + [str(gff_start)] + coverage_currdata[3:] + ["0"])
                coverage_currdata[1] = str(gff_start)
        elif cov_start > gff_start:
            # GFF start is before coverage file start.
            if cov_start > gff_end:
                # Whole GFF file data is before coverage file data
                emit(_gff_interval_row(gff_currdata, gff_currdata[4], cov_blank))
                gff_currdata, gff_lookahead = move_gff_ahead(gff_in, gff_lookahead)
            else:
                # Print uncovered GFF up to coverage file start
                emit(_gff_interval_row(gff_currdata, coverage_currdata[1], cov_blank))
                gff_currdata[3] = str(cov_start + 1)
        else:
            # Coverage file and GFF data have same start.
            if cov_end < gff_end:
                # Coverage file ends first: output, update GFF, advance coverage
                emit(coverage_currdata + ["1"])
                gff_currdata[3] = str(cov_end + 1)
                coverage_currdata = _read_coverage_fields(coverage_in)
            elif cov_end > gff_end:
                # GFF ends first: output, update coverage, advance GFF.
                # The remaining coverage is never empty (cov_end > gff_end).
                emit(coverage_currdata[0:2] + [gff_currdata[4]] + coverage_currdata[3:] + ["1"])
                coverage_currdata[1] = gff_currdata[4]
                gff_currdata, gff_lookahead = move_gff_ahead(gff_in, gff_lookahead)
            else:
                # Both end at the same point: Output and advance both.
                emit(coverage_currdata + ["1"])
                coverage_currdata = _read_coverage_fields(coverage_in)
                gff_currdata, gff_lookahead = move_gff_ahead(gff_in, gff_lookahead)
    coverage_out.close()
    gff_in.close()
    coverage_in.close()
    return coverage_out_path
def genome_analyzer(genotype_file, server=None, options=None):
    """Perform analyses on genotype_file.

    Converts and sorts the input (phasing it against both parents when
    their data can be found), then pulls it through a chain of
    GFF-formatted string generators, writing results into the output
    directory reported by processing_init().

    Arguments:
      genotype_file: path of the genome data to analyze.
      server: forwarded unchanged to processing_init().
      options: optional object. Its ``chromosome`` and ``metadata_only``
        attributes are read here and the object is forwarded to
        process_source().

    Returns None. Returns immediately if processing_init() gives back a
    false value.

    Raises Exception if the detected genome build is neither "b36" nor
    "b37".
    """
    # Local import: only needed to move the finished output files in place.
    import subprocess

    init_stuff = processing_init(genotype_file, server)
    if not init_stuff:
        return None
    output_dir, log, log_handle, lockfile, logfile = init_stuff

    data_dir = os.getenv("DATA")

    # Set up arguments used by processing commands and scripts.
    args = {
        "genotype_input": str(genotype_file),
        "miss_out": os.path.join(output_dir, "missing_coding.json"),
        "sorted_out": os.path.join(output_dir, "source_sorted.gff.gz"),
        "nonsyn_out_tmp": os.path.join(output_dir, "ns_tmp.gff.gz"),
        "nonsyn_out": os.path.join(output_dir, "ns.gff.gz"),
        "getev_out": os.path.join(output_dir, "get-evidence.json"),
        "getev_genes_out": os.path.join(output_dir, "get-ev_genes.json"),
        "metadata_out": os.path.join(output_dir, "metadata.json"),
        "genome_stats": os.path.join(data_dir, "genome_stats.txt"),
        "genetests": os.path.join(data_dir, GENETESTS_DATA),
        "getev_flat": os.path.join(data_dir, GETEV_FLAT),
    }

    # Make output directory if needed. Failure is reported but not fatal,
    # as before; only real errors are caught (not KeyboardInterrupt etc.).
    try:
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
    except Exception:
        print("Unexpected error: %s" % (sys.exc_info()[0],))

    # Read metadata with uploaded file, if available. A missing, unreadable
    # or empty metadata file all mean "no metadata".
    genome_data = dict()
    try:
        f_metadata = autozip.file_open(os.path.dirname(genotype_file) + "/metadata.json")
        try:
            genome_data = json.loads(f_metadata.next())
        finally:
            f_metadata.close()
    except (IOError, StopIteration):
        genome_data = dict()

    # Process and sort input genome data
    log.put("#status 0/100 converting and sorting input file")
    gff_in_gen = None
    # Look for parents and, if possible, use these to phase genome.
    if "parent A" in genome_data and "parent B" in genome_data:
        # Parent directories are siblings of the directory holding the input.
        upload_root = os.path.dirname(os.path.dirname(args["genotype_input"]))
        parA_in_dir = os.path.join(upload_root, genome_data["parent A"])
        parB_in_dir = os.path.join(upload_root, genome_data["parent B"])
        if os.path.exists(parA_in_dir) and os.path.exists(parB_in_dir):
            parA_file_match = [x for x in os.listdir(parA_in_dir) if re.match("genotype", x)]
            parB_file_match = [x for x in os.listdir(parB_in_dir) if re.match("genotype", x)]
            if parA_file_match and parB_file_match:
                parA_input = os.path.join(parA_in_dir, parA_file_match[0])
                parB_input = os.path.join(parB_in_dir, parB_file_match[0])
                gff_parA_gen = process_source(parA_input, dict(), options=options)
                gff_parB_gen = process_source(parB_input, dict(), options=options)
                gff_child_gen = process_source(args["genotype_input"], genome_data, options=options)
                # The first item each generator yields is its genome build.
                parA_build = gff_parA_gen.next()
                parB_build = gff_parB_gen.next()
                genome_data["genome_build"] = gff_child_gen.next()
                # Only phase when all three genomes use the same build.
                if parA_build == genome_data["genome_build"] and parB_build == genome_data["genome_build"]:
                    trio_phase = gff_trio_phase.PhaseTrio(gff_child_gen, gff_parA_gen, gff_parB_gen, False)
                    gff_in_gen = trio_phase.call_phase()
    # Set up if trio phasing couldn't be done.
    if not gff_in_gen:
        # We pass build as a yield (instead of in metadata) to force the
        # generator to read through the header portion of the input data.
        gff_in_gen = process_source(args["genotype_input"], genome_data, options=options)
        genome_data["genome_build"] = gff_in_gen.next()

    # Set up build-dependent file locations
    if genome_data["genome_build"] == "b36":
        args["dbsnp"] = os.path.join(data_dir, DBSNP_B36_SORTED)
        args["reference"] = os.path.join(data_dir, REFERENCE_GENOME_HG18)
        args["transcripts"] = os.path.join(data_dir, KNOWNGENE_HG18_SORTED)
    elif genome_data["genome_build"] == "b37":
        args["dbsnp"] = os.path.join(data_dir, DBSNP_B37_SORTED)
        args["reference"] = os.path.join(data_dir, REFERENCE_GENOME_HG19)
        args["transcripts"] = os.path.join(data_dir, KNOWNGENE_HG19_SORTED)
    else:
        raise Exception("genome build data is invalid")

    if options and options.chromosome:
        chrlist = [options.chromosome]
    else:
        # It might be more elegant to extract this from metadata.
        # range(1, 23) covers autosomes 1..22; the upper bound is exclusive.
        chrlist = ["chr" + str(x) for x in list(range(1, 23)) + ["X", "Y"]]

    # Process genome through a series of GFF-formatted string generators.
    log.put(
        "#status 20 looking up reference alleles and "
        "dbSNP IDs, computing nonsynonymous changes, "
        "cross-referencing GET-Evidence database"
    )
    progtrack = ProgressTracker(sys.stderr, [22, 99], expected=chrlist, metadata=genome_data)

    if not options or not options.chromosome:

        # Record chromosomes seen and genome coverage.
        gff_in_gen = get_metadata.genome_metadata(gff_in_gen, args["genome_stats"], progresstracker=progtrack)

        # Report coding regions that lack coverage.
        gff_in_gen = call_missing.report_uncovered(
            gff_in_gen, args["transcripts"], args["genetests"], output_file=args["miss_out"], progresstracker=progtrack
        )

    if options and options.metadata_only:
        # Drain the generator chain so the metadata stages see every line.
        for line in gff_in_gen:
            pass

    else:
        # Find reference allele.
        gff_in_gen = gff_twobit_query.match2ref(gff_in_gen, args["reference"])

        # Look up dbSNP IDs
        gff_in_gen = gff_dbsnp_query.match2dbSNP(gff_in_gen, args["dbsnp"])

        # Check for nonsynonymous SNP
        gff_in_gen = gff_nonsynonymous_filter.predict_nonsynonymous(gff_in_gen, args["reference"], args["transcripts"])

        # Pull off GET-Evidence hits
        gff_in_gen = gff_getevidence_map.match_getev(
            gff_in_gen,
            args["getev_flat"],
            transcripts_file=args["transcripts"],
            gene_out_file=args["getev_genes_out"] + ".tmp",
            output_file=args["getev_out"] + ".tmp",
            progresstracker=progtrack,
        )

        # Printing to output, pulls data through the generator chain.
        ns_out = autozip.file_open(args["nonsyn_out_tmp"], "w")
        try:
            for line in gff_in_gen:
                ns_out.write(line + "\n")
        finally:
            ns_out.close()

        # Move temporary outputs to their final names. An argument list is
        # used instead of a shell string so paths containing spaces or shell
        # metacharacters are handled correctly; as before, a failed move is
        # not fatal.
        for tmp_path, final_path in (
            (args["getev_out"] + ".tmp", args["getev_out"]),
            (args["nonsyn_out_tmp"], args["nonsyn_out"]),
            (args["getev_genes_out"] + ".tmp", args["getev_genes_out"]),
        ):
            subprocess.call(["mv", "--", tmp_path, final_path])

    # Print metadata
    metadata_f_out = open(args["metadata_out"], "w")
    try:
        progtrack.write_metadata(metadata_f_out)
    finally:
        metadata_f_out.close()

    log.put("#status 100 finished")

    # Renaming the lockfile to the logfile marks the run as complete.
    os.rename(lockfile, logfile)
    log_handle.close()
    print("Finished processing file " + str(genotype_file))
def get_allele_freqs(password,
                     getev_file,
                     excluded=None,
                     chromfile=None,
                     outputfile=None):
    """Walk a set of genomes in parallel and report variant frequencies.

    Arguments:
      password: forwarded to get_genome_list().
      getev_file: forwarded to load_getev().
      excluded: forwarded to get_genome_list().
      chromfile: if given, read with read_single_items() and used to
        restrict the GenomeSet's chromosomes.
      outputfile: if given, results are written there (one per line) and
        progress messages are printed; otherwise results are printed and
        progress messages are suppressed.

    Returns None.
    """
    # Set up output, genome inputs, GET-Evidence variants, and twobit reference.
    f_out = None
    if outputfile:
        print("Setting up output file")
        f_out = autozip.file_open(outputfile, 'w')

    # Everything after opening the output runs under try/finally so the
    # output file is always closed (and therefore flushed), even on error.
    try:
        genome_ids = get_genome_list(password, excluded)
        if chromfile:
            if f_out:
                print("Getting chromosomes...")
            chroms = read_single_items(chromfile)
        else:
            chroms = None
        if f_out:
            print("Reading GET-Ev flat file (takes a couple minutes)...")
        getev_variants = load_getev(getev_file)
        if f_out:
            print("Loading twobit genome...")
        twobit_genome = twobit.input(TWOBIT_PATH)
        if f_out:
            print(
                "Setting up GenomeSet (may be slow if each genome has to advance "
                + "to target chromosomes)...")
            genome_set = GenomeSet(genome_ids,
                                   chroms=chroms,
                                   getev_vars=getev_variants,
                                   verbose=True)
        else:
            genome_set = GenomeSet(genome_ids,
                                   chroms=chroms,
                                   getev_vars=getev_variants)
        if f_out:
            print("Find earliest ends")
        earliest_ends = genome_set.earliest_ends()

        # Move through the genomes to find allele frequencies
        while genome_set.genomes:
            # Move ahead of all "earliest ends" & save new earliest.
            next_earliest = genome_set.advance_all_past_end_pos(earliest_ends[0])

            # Check all old "earliest ends" positions for interesting
            # variants: a position is a variant when its 'ref' entry is
            # false, and interesting when it carries 'amino_acid' or
            # 'getev_id'.
            has_var = []
            is_interesting = False
            for position in earliest_ends:
                if not position['ref']:
                    has_var.append(position)
                    if 'amino_acid' in position or 'getev_id' in position:
                        is_interesting = True

            # If there are interesting variants, calculate allele frequency.
            if has_var and is_interesting:
                # Check if another genomes has an overlapping variant extending
                # beyond this position, we're not ready to evaluate this yet
                # (it will be caught when the later overlapping one comes up).
                if genome_set.no_later_var(has_var):
                    freqout = genome_set.eval_var_freq(has_var, twobit_genome)
                    if f_out:
                        f_out.write(freqout + '\n')
                    else:
                        print(freqout)

            genome_set.clean_out_prior_pos(earliest_ends)

            # Reset "earliest end" to next earliest positions.
            earliest_ends = next_earliest
    finally:
        if f_out is not None:
            f_out.close()
def get_allele_freqs(password, getev_file, excluded=None, chromfile=None,
                     outputfile=None):
    """Compute allele frequencies for interesting variants across genomes.

    The genome list comes from get_genome_list(password, excluded) and the
    GET-Evidence variants from load_getev(getev_file). When chromfile is
    given, its items (read with read_single_items()) are passed to the
    GenomeSet as the chromosomes to use.

    With outputfile set, each frequency line is written to that file and
    progress messages are printed; without it, frequency lines are printed
    and progress messages are omitted.

    Returns None.
    """
    # Set up output, genome inputs, GET-Evidence variants, and twobit reference.
    if outputfile:
        print("Setting up output file")
        f_out = autozip.file_open(outputfile, 'w')
    else:
        f_out = None

    try:
        genome_ids = get_genome_list(password, excluded)
        chroms = None
        if chromfile:
            if f_out:
                print("Getting chromosomes...")
            chroms = read_single_items(chromfile)
        if f_out:
            print("Reading GET-Ev flat file (takes a couple minutes)...")
        getev_variants = load_getev(getev_file)
        if f_out:
            print("Loading twobit genome...")
        twobit_genome = twobit.input(TWOBIT_PATH)
        if f_out:
            print("Setting up GenomeSet (may be slow if each genome has to advance " +
                  "to target chromosomes)...")
            genome_set = GenomeSet(genome_ids, chroms=chroms,
                                   getev_vars=getev_variants, verbose=True)
        else:
            genome_set = GenomeSet(genome_ids, chroms=chroms,
                                   getev_vars=getev_variants)
        if f_out:
            print("Find earliest ends")
        earliest_ends = genome_set.earliest_ends()

        # Move through the genomes to find allele frequencies
        while genome_set.genomes:
            # Move ahead of all "earliest ends" & save new earliest.
            next_earliest = genome_set.advance_all_past_end_pos(earliest_ends[0])

            # Positions whose 'ref' entry is false are variants; the group is
            # interesting if any variant has 'amino_acid' or 'getev_id'.
            has_var = []
            is_interesting = False
            for position in earliest_ends:
                if position['ref']:
                    continue
                has_var.append(position)
                if 'amino_acid' in position or 'getev_id' in position:
                    is_interesting = True

            # If there are interesting variants, calculate allele frequency.
            # If another genome has an overlapping variant extending beyond
            # this position we're not ready to evaluate yet (it will be
            # caught when the later overlapping one comes up).
            if has_var and is_interesting and genome_set.no_later_var(has_var):
                freqout = genome_set.eval_var_freq(has_var, twobit_genome)
                if f_out:
                    f_out.write(freqout + '\n')
                else:
                    print(freqout)

            genome_set.clean_out_prior_pos(earliest_ends)

            # Reset "earliest end" to next earliest positions.
            earliest_ends = next_earliest
    finally:
        # Always close the output so buffered results are flushed to disk.
        if f_out is not None:
            f_out.close()
def genome_analyzer(genotype_file, server=None, options=None):
    """Perform analyses on genotype_file.

    Converts and sorts the input (phasing it against both parents when
    their data can be found), then pulls it through a chain of
    GFF-formatted string generators and writes the results to the output
    directory. Data file locations are read from the module-level
    ``config`` mapping.

    Arguments:
      genotype_file: path of the genome data to analyze.
      server: forwarded unchanged to processing_init().
      options: optional object. Its ``output_dir``, ``chromosome``,
        ``no_metadata`` and ``metadata_only`` attributes are read here and
        the object is forwarded to process_source().

    Returns None. Returns immediately if processing_init() gives back a
    false value.

    Raises Exception if the detected genome build is neither "b36" nor
    "b37".
    """
    # Local import: only needed to move the finished output files in place.
    import subprocess

    init_stuff = processing_init(genotype_file, server)
    if not init_stuff:
        return None
    output_dir, log, log_handle, lockfile, logfile = init_stuff

    # override default output directory
    if options and options.output_dir:
        output_dir = options.output_dir

        try:
            os.makedirs(output_dir, mode=0o777)
        except OSError:
            # Typically the directory already exists. If creation really
            # failed, the "make output directory" step below retries and
            # reports the error.
            pass

    # Set up arguments used by processing commands and scripts.
    args = {
        'genotype_input': str(genotype_file),
        'miss_out': os.path.join(output_dir, 'missing_coding.json'),
        'sorted_out': os.path.join(output_dir, 'source_sorted.gff.gz'),
        'nonsyn_out_tmp': os.path.join(output_dir, 'ns_tmp.gff.gz'),
        'nonsyn_out': os.path.join(output_dir, 'ns.gff.gz'),
        'getev_out': os.path.join(output_dir, 'get-evidence.json'),
        'getev_genes_out': os.path.join(output_dir, 'get-ev_genes.json'),
        'metadata_out': os.path.join(output_dir, 'metadata.json'),
        'genome_stats': config['genome_stats'],
        'genetests': config['GENETESTS_DATA'],
        'getev_flat': config['GETEV_FLAT'],
    }

    # Make output directory if needed. Failure is reported but not fatal,
    # as before; only real errors are caught (not KeyboardInterrupt etc.).
    try:
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
    except Exception:
        print("Unexpected error: %s" % (sys.exc_info()[0],))

    # Read metadata with uploaded file, if available. A missing, unreadable
    # or empty metadata file all mean "no metadata".
    genome_data = dict()
    try:
        f_metadata = autozip.file_open(os.path.dirname(genotype_file) +
                                       '/metadata.json')
        try:
            genome_data = json.loads(f_metadata.next())
        finally:
            f_metadata.close()
    except (IOError, StopIteration):
        genome_data = dict()

    # Process and sort input genome data
    log.put('#status 0/100 converting and sorting input file')
    gff_in_gen = None
    # Look for parents and, if possible, use these to phase genome.
    if 'parent A' in genome_data and 'parent B' in genome_data:
        # Parent directories are siblings of the directory holding the input.
        upload_root = os.path.dirname(os.path.dirname(args['genotype_input']))
        parA_in_dir = os.path.join(upload_root, genome_data['parent A'])
        parB_in_dir = os.path.join(upload_root, genome_data['parent B'])
        if os.path.exists(parA_in_dir) and os.path.exists(parB_in_dir):
            parA_file_match = [x for x in os.listdir(parA_in_dir)
                               if re.match('genotype', x)]
            parB_file_match = [x for x in os.listdir(parB_in_dir)
                               if re.match('genotype', x)]
            if parA_file_match and parB_file_match:
                parA_input = os.path.join(parA_in_dir, parA_file_match[0])
                parB_input = os.path.join(parB_in_dir, parB_file_match[0])
                gff_parA_gen = process_source(parA_input, dict(), options=options)
                gff_parB_gen = process_source(parB_input, dict(), options=options)
                gff_child_gen = process_source(args['genotype_input'],
                                               genome_data, options=options)
                # The first item each generator yields is its genome build.
                parA_build = gff_parA_gen.next()
                parB_build = gff_parB_gen.next()
                genome_data['genome_build'] = gff_child_gen.next()
                # Only phase when all three genomes use the same build.
                if (parA_build == genome_data['genome_build'] and
                        parB_build == genome_data['genome_build']):
                    trio_phase = gff_trio_phase.PhaseTrio(gff_child_gen,
                                                          gff_parA_gen,
                                                          gff_parB_gen, False)
                    gff_in_gen = trio_phase.call_phase()
    # Set up if trio phasing couldn't be done.
    if not gff_in_gen:
        # We pass build as a yield (instead of in metadata) to force the
        # generator to read through the header portion of the input data.
        gff_in_gen = process_source(args['genotype_input'], genome_data,
                                    options=options)
        genome_data['genome_build'] = gff_in_gen.next()

    # Set up build-dependent file locations
    if genome_data['genome_build'] == "b36":
        args['dbsnp'] = config["DBSNP_B36_SORTED"]
        args['reference'] = config["REFERENCE_GENOME_HG18"]
        args['transcripts'] = config["KNOWNGENE_HG18_SORTED"]
    elif genome_data['genome_build'] == "b37":
        args['dbsnp'] = config["DBSNP_B37_SORTED"]
        args['reference'] = config["REFERENCE_GENOME_HG19"]
        args['transcripts'] = config["KNOWNGENE_HG19_SORTED"]
    else:
        raise Exception("genome build data is invalid")

    if options and options.chromosome:
        chrlist = [options.chromosome]
    else:
        # It might be more elegant to extract this from metadata.
        # range(1, 23) covers autosomes 1..22; the upper bound is exclusive.
        chrlist = ['chr' + str(x) for x in list(range(1, 23)) + ['X', 'Y']]

    # Process genome through a series of GFF-formatted string generators.
    log.put('#status 20 looking up reference alleles and '
            'dbSNP IDs, computing nonsynonymous changes, '
            'cross-referencing GET-Evidence database')
    progtrack = ProgressTracker(sys.stderr, [22, 99], expected=chrlist,
                                metadata=genome_data)

    if not options or not options.no_metadata:

        # Record chromosomes seen and genome coverage.
        gff_in_gen = get_metadata.genome_metadata(gff_in_gen,
                                                  args['genome_stats'],
                                                  progresstracker=progtrack)

        # Report coding regions that lack coverage.
        gff_in_gen = call_missing.report_uncovered(gff_in_gen,
                                                   args['transcripts'],
                                                   args['genetests'],
                                                   output_file=args['miss_out'],
                                                   progresstracker=progtrack)

    if options and options.metadata_only:
        # Drain the generator chain so the metadata stages see every line.
        for line in gff_in_gen:
            pass

    else:
        # Find reference allele.
        gff_in_gen = gff_twobit_query.match2ref(gff_in_gen, args['reference'])

        # Look up dbSNP IDs
        gff_in_gen = gff_dbsnp_query.match2dbSNP(gff_in_gen, args['dbsnp'])

        # Check for nonsynonymous SNP
        gff_in_gen = gff_nonsynonymous_filter.predict_nonsynonymous(
            gff_in_gen, args['reference'], args['transcripts'])

        # Pull off GET-Evidence hits
        gff_in_gen = gff_getevidence_map.match_getev(
            gff_in_gen,
            args['getev_flat'],
            transcripts_file=args['transcripts'],
            gene_out_file=args['getev_genes_out'] + ".tmp",
            output_file=args['getev_out'] + ".tmp",
            progresstracker=progtrack,
            genetests_filepath=config['GENETESTS_DATA'],
            blosum100_file=config['BLOSUM100'])

        # Printing to output, pulls data through the generator chain.
        ns_out = autozip.file_open(args['nonsyn_out_tmp'], 'w')
        try:
            for line in gff_in_gen:
                ns_out.write(line + "\n")
        finally:
            ns_out.close()

        # Move temporary outputs to their final names. An argument list is
        # used instead of a shell string so paths containing spaces or shell
        # metacharacters are handled correctly; as before, a failed move is
        # not fatal.
        for tmp_path, final_path in (
            (args['getev_out'] + ".tmp", args['getev_out']),
            (args['nonsyn_out_tmp'], args['nonsyn_out']),
            (args['getev_genes_out'] + ".tmp", args['getev_genes_out']),
        ):
            subprocess.call(["mv", "--", tmp_path, final_path])

    # Print metadata
    metadata_f_out = open(args['metadata_out'], 'w')
    try:
        progtrack.write_metadata(metadata_f_out)
    finally:
        metadata_f_out.close()

    log.put('#status 100 finished')

    # Renaming the lockfile to the logfile marks the run as complete.
    os.rename(lockfile, logfile)
    log_handle.close()
    print("Finished processing file " + str(genotype_file))
 def open_genome_file(self, genome_id):
     """Open the sequence data file for genome_id and store it as self.f_in.

     The path is GENOMEFILE_PRE + genome_id + GENOMEDATA_POST.
     """
     self.f_in = autozip.file_open(
         GENOMEFILE_PRE + genome_id + GENOMEDATA_POST)
 def read_metadata(self, genome_id):
     """Open file containing metadata, initializes self.metadata.

     The path is GENOMEFILE_PRE + genome_id + GENOMEMETA_POST. Only the
     first line of the file is read and parsed as JSON.
     """
     metadata_path = GENOMEFILE_PRE + genome_id + GENOMEMETA_POST
     f_meta = autozip.file_open(metadata_path)
     try:
         self.metadata = json.loads(f_meta.next())
     finally:
         # Close the handle even when the file is empty or not valid JSON.
         f_meta.close()
 def open_genome_file(self, genome_id):
     """Initialize self.f_in with an open handle on this genome's data file."""
     # Path layout: <GENOMEFILE_PRE><genome_id><GENOMEDATA_POST>
     data_path = GENOMEFILE_PRE + genome_id
     data_path = data_path + GENOMEDATA_POST
     self.f_in = autozip.file_open(data_path)
Exemple #46
0
def detect_format(file_input):
    """Detect the genetic data format of a file.

    Takes a path to a file, or a string generator (e.g. a filehandle).
    Tries to match one of the following:
      23ANDME: 23andme (microarray genotyping)
      CGIVAR: Complete Genomics var file
      deCODEme: deCODEme (microarray genotyping)
      FTDNA: Family Tree DNA (genotyping data)
      GFF: General Feature Format
      VCF: Variant Call Format (only tested for 23andme exome data)
      ANCESTRY: Ancestry (genotyping data)

    At most MAX_LINES_CHECKED lines are examined, and scanning stops on
    the line after the first match. Returns the name of a matched format,
    or 'UNKNOWN' if nothing matched. A file opened here from a path is
    closed before returning; a caller-supplied iterable is left open (and
    partly consumed).
    """
    looks_like = dict()
    opened_here = isinstance(file_input, str)
    if opened_here:
        try:
            f_in = autozip.file_open(file_input, 'r')
        except AssertionError:
            # Retry naming the archive member explicitly; if that works the
            # input is treated as a deCODEme archive.
            f_in = autozip.file_open(file_input, 'r', 'deCODEme_scan.csv')
            if VERBOSE:
                print("deCODEme archive (deCODEme) detected")
            looks_like['deCODEme'] = True
    else:
        f_in = file_input

    try:
        line_count = 0
        for line in f_in:
            line_count += 1
            if any([looks_like[x] for x in looks_like.keys()]):
                break
            if line_count > MAX_LINES_CHECKED:
                break

            # Check comment lines, if they exist, for information on file type.
            if re.match('#', line):
                if re.match(r'#TYPE.*VAR-ANNOTATION', line):
                    if VERBOSE:
                        print("Complete Genomics var file format (CGIVAR) detected")
                    looks_like['CGIVAR'] = True
                if re.match(r'##gff-version', line):
                    if VERBOSE:
                        print("General Feature Format (GFF) detected")
                    looks_like['GFF'] = True
                if re.match(r'# This data file generated by 23andMe', line):
                    if VERBOSE:
                        print("23andme microarray genotyping data (23ANDME) detected")
                    looks_like['23ANDME'] = True
                if re.match(r'##fileformat=VCFv4', line):
                    if VERBOSE:
                        print("Variant Call Format (VCF) detected")
                    looks_like['VCF'] = True
                if re.match(r'#\s*AncestryDNA', line):
                    if VERBOSE:
                        print("Ancestry genotyping data (ANCESTRY) detected")
                    # Record the match; previously the header was recognized
                    # (and announced) but never stored.
                    looks_like['ANCESTRY'] = True

            # Look at other lines and decide based on their format.
            tsv_data = line.split('\t')
            csv_data = list(csv.reader([line]))[0]

            if (len(csv_data) > 5 and re.match(r'rs', csv_data[0])
                    and re.match(r'[ACGT]', csv_data[1])
                    and re.match(r'[0-9]', csv_data[3])
                    and re.match(r'[+-]', csv_data[4])
                    and re.match(r'[ACGT]', csv_data[5])):
                if VERBOSE:
                    print("deCODEme microarray genotyping data (deCODEme) guessed")
                looks_like['deCODEme'] = True
            if (len(csv_data) > 3 and re.match(r'rs', csv_data[0])
                    and re.match(r'[0-9]', csv_data[2])
                    and re.match(r'[ACGT]', csv_data[3])):
                if VERBOSE:
                    print("Family Tree DNA genotyping data (FTDNA) guessed")
                looks_like['FTDNA'] = True
            if (len(tsv_data) > 3 and re.match(r'rs', tsv_data[0])
                    and re.match(r'[0-9]', tsv_data[2])
                    and re.match(r'[ACGT][ACGT]', tsv_data[3])):
                if VERBOSE:
                    print("23andme microarray genotyping data (23ANDME) guessed")
                looks_like['23ANDME'] = True
            if (len(tsv_data) > 4 and re.match(r'rs', tsv_data[0])
                    and re.match(r'[0-9]', tsv_data[2])
                    and re.match(r'[ACGT0-9]', tsv_data[3])
                    and re.match(r'[ACGT0-9]', tsv_data[4])):
                if VERBOSE:
                    print("Ancestry genotyping data (ANCESTRY) guessed")
                looks_like['ANCESTRY'] = True
            if (len(tsv_data) > 6 and re.match(r'chr', tsv_data[3])
                    and re.match(r'[0-9]', tsv_data[4])
                    and re.match(r'[0-9]', tsv_data[5])
                    and (tsv_data[6] == "no-call" or tsv_data[6] == "ref")):
                if VERBOSE:
                    print("Complete Genomics var file format (CGIvar) guessed")
                looks_like['CGIVAR'] = True
            if (len(tsv_data) > 6 and re.match(r'[0-9]', tsv_data[3])
                    and re.match(r'[0-9]', tsv_data[4]) and tsv_data[6] == "+"):
                if VERBOSE:
                    print("General Feature Format (GFF) guessed")
                looks_like['GFF'] = True
            if (len(tsv_data) > 7 and re.match(r'[0-9]', tsv_data[1])
                    and re.match(r'[ACGT]', tsv_data[3])
                    and re.match(r'[ACGT]', tsv_data[4])
                    and len(tsv_data[7].split(';')) > 2):
                if VERBOSE:
                    print("Variant Call Format (VCF) guessed")
                looks_like['VCF'] = True
    finally:
        # Only close what was opened here, and do so even if scanning fails.
        if opened_here:
            f_in.close()

    if any([looks_like[x] for x in looks_like.keys()]):
        return [x for x in looks_like.keys() if looks_like[x]][0]
    else:
        return 'UNKNOWN'
def detect_format(file_input):
    """Guess which genetic data format an input is in.

    file_input is either a path (str), which is opened and closed here, or
    an iterable of lines such as a filehandle, which is read but not
    closed. Recognized formats:
      23ANDME: 23andme (microarray genotyping)
      CGIVAR: Complete Genomics var file
      deCODEme: deCODEme (microarray genotyping)
      FTDNA: Family Tree DNA (genotyping data)
      GFF: General Feature Format
      VCF: Variant Call Format (only tested for 23andme exome data)

    Each detection is announced with a print. Returns the name of a
    matched format, or "UNKNOWN" when no line within the first
    MAX_LINES_CHECKED matched.
    """
    matched = dict()
    given_path = isinstance(file_input, str)
    if not given_path:
        f_in = file_input
    else:
        try:
            f_in = autozip.file_open(file_input, "r")
        except AssertionError:
            f_in = autozip.file_open(file_input, "r", "deCODEme_scan.csv")
            print("deCODEme archive (deCODEme) detected")
            matched["deCODEme"] = True

    # (pattern, format name, message) for header lines that identify a
    # format outright; checked in this order.
    header_signatures = (
        (r"#TYPE.*VAR-ANNOTATION", "CGIVAR",
         "Complete Genomics var file format (CGIVAR) detected"),
        (r"##gff-version", "GFF",
         "General Feature Format (GFF) detected"),
        (r"# This data file generated by 23andMe", "23ANDME",
         "23andme microarray genotyping data (23ANDME) detected"),
        (r"##fileformat=VCFv4", "VCF",
         "Variant Call Format (VCF) detected"),
    )

    def begins(pattern, text):
        # True when text starts with something matching pattern.
        return re.match(pattern, text) is not None

    for line_number, line in enumerate(f_in, 1):
        # Stop once a previous line produced a match, or after the cap.
        if any(matched.values()) or line_number > MAX_LINES_CHECKED:
            break

        # Check comment lines, if they exist, for information on file type.
        if begins("#", line):
            for pattern, name, message in header_signatures:
                if begins(pattern, line):
                    print(message)
                    matched[name] = True

        # Look at other lines and decide based on their format.
        tab_fields = line.split("\t")
        comma_fields = list(csv.reader([line]))[0]
        n_tab = len(tab_fields)
        n_comma = len(comma_fields)

        if n_comma > 5:
            if (begins(r"rs", comma_fields[0])
                    and begins(r"[ACGT]", comma_fields[1])
                    and begins(r"[0-9]", comma_fields[3])
                    and begins(r"[+-]", comma_fields[4])
                    and begins(r"[ACGT]", comma_fields[5])):
                print("deCODEme microarray genotyping data (deCODEme) guessed")
                matched["deCODEme"] = True

        if n_comma > 3:
            if (begins(r"rs", comma_fields[0])
                    and begins(r"[0-9]", comma_fields[2])
                    and begins(r"[ACGT]", comma_fields[3])):
                print("Family Tree DNA genotyping data (FTDNA) guessed")
                matched["FTDNA"] = True

        if n_tab > 3:
            if (begins(r"rs", tab_fields[0])
                    and begins(r"[0-9]", tab_fields[2])
                    and begins(r"[ACGT][ACGT]", tab_fields[3])):
                print("23andme microarray genotyping data (23ANDME) guessed")
                matched["23ANDME"] = True

        if n_tab > 6:
            if (begins(r"chr", tab_fields[3])
                    and begins(r"[0-9]", tab_fields[4])
                    and begins(r"[0-9]", tab_fields[5])
                    and tab_fields[6] in ("no-call", "ref")):
                print("Complete Genomics var file format (CGIvar) guessed")
                matched["CGIVAR"] = True

            if (begins(r"[0-9]", tab_fields[3])
                    and begins(r"[0-9]", tab_fields[4])
                    and tab_fields[6] == "+"):
                print("General Feature Format (GFF) guessed")
                matched["GFF"] = True

        if n_tab > 7:
            if (begins(r"[0-9]", tab_fields[1])
                    and begins(r"[ACGT]", tab_fields[3])
                    and begins(r"[ACGT]", tab_fields[4])
                    and len(tab_fields[7].split(";")) > 2):
                print("Variant Call Format (VCF) guessed")
                matched["VCF"] = True

    if given_path:
        f_in.close()

    # Report the first recorded format, if any.
    for name in matched.keys():
        if matched[name]:
            return name
    return "UNKNOWN"