def extract_recessive_disorder_candidates(input,
                                          output,
                                          gene_column_name='Gene.refGene',
                                          zygozity_column_name='Otherinfo',
                                          delim=','):
    """Write the rows of an annotated table that are candidates for recessive disorders.

    Rows of the input table are grouped by the gene ids found in the gene
    column. All rows of a gene are written when the gene has two or more rows;
    a gene with a single row is written only when that row carries the
    homozygosity marker in the zygosity column. The header line is always
    written first.

    A row that lists several genes is stored under each of them, so it can be
    written more than once. Genes are written in dictionary iteration order.

    Args:
        input: path of the delimited table to read (first line is the header).
        output: path of the table to write.
        gene_column_name: header name of the column holding the gene id(s).
        zygozity_column_name: header name of the column searched for the
            homozygosity marker.
        delim: column delimiter used for reading and for the output header.

    Raises:
        ValueError: from ``header.index`` when a column name is not in the header.
    """
    variant_records_per_gene = {}

    # 'with' guarantees the input is closed even if parsing fails
    with open(input, 'r') as table_in:
        # get right column indexes
        header = quote_aware_split(table_in.readline().strip(), delim)
        gene_col_index = header.index(gene_column_name)
        zygozity_col_index = header.index(zygozity_column_name)

        # iterate the file object directly: xreadlines() is gone in Python 3
        for l in table_in:
            lsplit = quote_aware_split(l, delim)
            gene = lsplit[gene_col_index].strip('"')

            # the gene record can be a list (e.g. overlapping genes), so it needs to be split
            genes = [gene]
            if gene.find(',') >= 0:
                genes = parenthesis_aware_split(gene, delim=',')
            genes = [parenthesis_aware_split(g, delim=';') for g in genes]

            # Strip the parenthesised suffix BEFORE de-duplicating; otherwise
            # "X(a)" and "X(b)" would store this row twice under gene X and
            # make a single variant look like two.
            gene_ids = set()
            for sublist in genes:
                for g in sublist:
                    paren = g.find('(')
                    gene_ids.add(g[:paren] if paren >= 0 else g)

            # put the variant record in the map
            for gene_id in gene_ids:
                variant_records_per_gene.setdefault(gene_id, []).append(l)

    # write the table with candidates for recessive inheritance model
    with open(output, 'w') as table_out:
        table_out.write(delim.join(header) + '\n')

        # iterate over the genes and select...
        for records in variant_records_per_gene.values():
            # ...these with 2 or more variants...
            if len(records) >= 2:
                for record in records:
                    table_out.write(record)
            # or homozygous variants
            else:
                # re-split with the caller's delimiter, as when the row was read
                lsplit = quote_aware_split(records[0], delim)
                # marker is an opening quote, 'hom', then a tab inside the cell
                if lsplit[zygozity_col_index].find('"hom\t') >= 0:
                    table_out.write(records[0])
def extract_recessive_disorder_candidates(
    input, output, gene_column_name="Gene.refGene", zygozity_column_name="Otherinfo", delim=","
):
    """Write the rows of an annotated table that are candidates for recessive disorders.

    Rows of the input table are grouped by the gene ids found in the gene
    column. All rows of a gene are written when the gene has two or more rows;
    a gene with a single row is written only when that row carries the
    homozygosity marker in the zygosity column. The header line is always
    written first.

    A row that lists several genes is stored under each of them, so it can be
    written more than once. Genes are written in dictionary iteration order.

    Args:
        input: path of the delimited table to read (first line is the header).
        output: path of the table to write.
        gene_column_name: header name of the column holding the gene id(s).
        zygozity_column_name: header name of the column searched for the
            homozygosity marker.
        delim: column delimiter used for reading and for the output header.

    Raises:
        ValueError: from ``header.index`` when a column name is not in the header.
    """
    variant_records_per_gene = {}

    # the context manager closes the input even when parsing raises
    with open(input, "r") as table_in:
        # get right column indexes
        header = quote_aware_split(table_in.readline().strip(), delim)
        gene_col_index = header.index(gene_column_name)
        zygozity_col_index = header.index(zygozity_column_name)

        # plain file iteration works on Python 2 and 3 (xreadlines() is Python 2 only)
        for l in table_in:
            lsplit = quote_aware_split(l, delim)
            gene = lsplit[gene_col_index].strip('"')

            # the gene record can be a list (e.g. overlapping genes), so it needs to be split
            genes = [gene]
            if gene.find(",") >= 0:
                genes = parenthesis_aware_split(gene, delim=",")
            genes = [parenthesis_aware_split(g, delim=";") for g in genes]

            # Remove the parenthesised suffix first and only then build the set:
            # entries differing just in the suffix are the same gene, and the
            # row must be stored once per gene for the ">= 2" test to be valid.
            gene_ids = set()
            for sublist in genes:
                for g in sublist:
                    paren = g.find("(")
                    gene_ids.add(g[:paren] if paren >= 0 else g)

            # put the variant record in the map
            for gene_id in gene_ids:
                variant_records_per_gene.setdefault(gene_id, []).append(l)

    # write the table with candidates for recessive inheritance model
    with open(output, "w") as table_out:
        table_out.write(delim.join(header) + "\n")

        # iterate over the genes and select...
        for records in variant_records_per_gene.values():
            # ...these with 2 or more variants...
            if len(records) >= 2:
                for record in records:
                    table_out.write(record)
            # or homozygous variants
            else:
                # split with the same delimiter that was used when reading the row
                lsplit = quote_aware_split(records[0], delim)
                # marker is an opening quote, 'hom', then a tab inside the cell
                if lsplit[zygozity_col_index].find('"hom\t') >= 0:
                    table_out.write(records[0])
def find_geneset_hits_in_samples(sample_files, geneset, gene_column_name='Gene.refGene', delim=',', has_header=True, fields=None):
    """
    Iterate over the per-sample variant tables (csv, tsv) and look for genes from the geneset.

    The gene column is given in the gene_column_name argument: either by name
    (if has_header==True), or by index of the column (if has_header==False; the
    value is converted with int() and used directly as a list index).

    Returns a dict in form {sample: {gene1: [variant_line], gene2: [variant_line1, variant_line2]}},
    where sample is the file name, gene1 and gene2 belong to the geneset, and
    variant_line is the row with every cell stripped of whitespace and double
    quotes and re-joined with tabs. As the same gene can appear twice with
    distinct variants, dict[sample][gene] is a list. Every sample file gets an
    entry, which stays empty when nothing matched.

    If fields is given, it is a sequence of indexes selecting which parts of
    the tab-separated variant_line to keep (in the given order).

    Progress messages are written to stderr.
    """
    hits = {s: {} for s in sample_files}

    for fname in sample_files:
        sys.stderr.write('Processing ' + fname + '...')

        # 'with' closes the file even if a row cannot be parsed
        with open(fname) as f:
            if has_header:
                header = quote_aware_split(f.readline().strip(), delim)
                gene_col_index = header.index(gene_column_name)
            else:
                gene_col_index = int(gene_column_name)

            # iterate the file object directly: xreadlines() is gone in Python 3
            for l in f:
                columns = quote_aware_split(l, delim)  # split once, reused below
                gene_entry = columns[gene_col_index].strip().strip('"')  # clean the gene name

                genes = [gene_entry]
                if gene_entry.find(',') >= 0:
                    genes = parenthesis_aware_split(gene_entry, delim=',')  # split multi gene entries
                genes = [parenthesis_aware_split(g, delim=';') for g in genes]

                # Strip the transcript change in parenthesis BEFORE de-duplicating,
                # so "X(a)" and "X(b)" do not record this row twice under gene X.
                gene_ids = set()
                for sublist in genes:
                    for g in sublist:
                        paren = g.find('(')
                        if paren > 0:
                            g = g[:paren]
                        gene_ids.add(g)

                variant_record = None  # built lazily, only for rows that hit the geneset
                for gene_id in gene_ids:
                    if gene_id not in geneset:
                        continue

                    if variant_record is None:
                        # clean the records
                        variant_record = '\t'.join([e.strip().strip('"') for e in columns])
                        # if requested, select a subset of fields
                        if fields is not None:
                            # The indexes address the tab-split record, not the
                            # original columns: a cell that itself contains tabs
                            # contributes several positions.
                            parts = variant_record.split('\t')
                            variant_record = '\t'.join([parts[i] for i in fields])

                    hits[fname].setdefault(gene_id, []).append(variant_record)

        sys.stderr.write('done\n')

    return hits
def include_omim_phenotype_annotation(inputs,
                                      output_table,
                                      gene_column=7,
                                      omim_column=15,
                                      delim=','):
    """Insert an 'omim_phenotype' column into the annotation table.

    The table is read from ``inputs[1]`` and written to ``output_table`` with
    one extra column. The new cell holds the phenotype(s) looked up in
    ``omim_gene_phenotype_map`` (a mapping expected to be defined elsewhere in
    this module) for the gene id(s) found in the gene column, wrapped in double
    quotes. When a row lists several genes, the distinct phenotypes found are
    joined with ';' in sorted gene order; when none is found the cell is "NA".

    Args:
        inputs: sequence of input paths; only ``inputs[1]`` is read here.
        output_table: path of the table to write.
        gene_column: 1-based position of the column holding the gene id(s).
        omim_column: 1-based position at which the new column is inserted;
            a value <= 0 appends it after the last header column.
        delim: column delimiter used for reading and writing.
    """
    # 'with' closes both files even if a row cannot be parsed
    with open(inputs[1], 'r') as table_in, open(output_table, 'w') as table_out:
        # header
        # Drop the line ending before splitting: otherwise it stays glued to the
        # last cell and ends up in the middle of the row when the new column is
        # appended at the end.
        header_in = quote_aware_split(table_in.readline().rstrip('\r\n'), delim)
        if omim_column <= 0:
            omim_column = len(header_in) + 1
        insert_at = omim_column - 1
        header_out = header_in[:insert_at] + ['omim_phenotype'] + header_in[insert_at:]
        table_out.write(delim.join(header_out) + '\n')

        # the rest of the table
        # iterate the file object directly: xreadlines() is gone in Python 3
        for l in table_in:
            lsplit = quote_aware_split(l.rstrip('\r\n'), delim)
            gene = lsplit[gene_column - 1].strip('"')

            # the gene record can be a list (e.g. overlapping genes), so it needs to be split
            genes = [gene]
            if gene.find(',') >= 0:
                genes = parenthesis_aware_split(gene, delim=',')
            genes = [parenthesis_aware_split(g, delim=';') for g in genes]

            # if present, remove suffix in parenthesis, then keep unique gene ids only
            gene_ids = set()
            for sublist in genes:
                for g in sublist:
                    paren = g.find('(')
                    gene_ids.add(g[:paren] if paren >= 0 else g)

            # Collect the phenotype of every listed gene. Sorting makes the
            # result independent of set iteration order, and starting from an
            # empty list per row means no value leaks in from the previous row.
            phenotypes = []
            for gene_id in sorted(gene_ids):
                try:
                    phenotype = omim_gene_phenotype_map[gene_id]
                except KeyError:
                    continue
                if phenotype not in phenotypes:
                    phenotypes.append(phenotype)
            omim_phenotype = ';'.join(phenotypes) if phenotypes else 'NA'

            table_out.write(
                delim.join(lsplit[:insert_at] +
                           ['"' + omim_phenotype + '"'] +
                           lsplit[insert_at:]) + '\n')
def include_omim_phenotype_annotation(inputs, output_table, gene_column=7, omim_column=15, delim=","):
    """Insert an 'omim_phenotype' column into the annotation table.

    The table is read from ``inputs[1]`` and written to ``output_table`` with
    one extra column. The new cell holds the phenotype(s) looked up in
    ``omim_gene_phenotype_map`` (a mapping expected to be defined elsewhere in
    this module) for the gene id(s) found in the gene column, wrapped in double
    quotes. When a row lists several genes, the distinct phenotypes found are
    joined with ';' in sorted gene order; when none is found the cell is "NA".

    Args:
        inputs: sequence of input paths; only ``inputs[1]`` is read here.
        output_table: path of the table to write.
        gene_column: 1-based position of the column holding the gene id(s).
        omim_column: 1-based position at which the new column is inserted;
            a value <= 0 appends it after the last header column.
        delim: column delimiter used for reading and writing.
    """
    # the context manager closes both files even when a row fails to parse
    with open(inputs[1], "r") as table_in, open(output_table, "w") as table_out:
        # header
        # The line ending is removed before splitting and written back
        # explicitly; left in place it would sit on the last cell and split the
        # row in two whenever the new column is appended at the end.
        header_in = quote_aware_split(table_in.readline().rstrip("\r\n"), delim)
        if omim_column <= 0:
            omim_column = len(header_in) + 1
        insert_at = omim_column - 1
        header_out = header_in[:insert_at] + ["omim_phenotype"] + header_in[insert_at:]
        table_out.write(delim.join(header_out) + "\n")

        # the rest of the table
        # plain file iteration works on Python 2 and 3 (xreadlines() is Python 2 only)
        for l in table_in:
            lsplit = quote_aware_split(l.rstrip("\r\n"), delim)
            gene = lsplit[gene_column - 1].strip('"')

            # the gene record can be a list (e.g. overlapping genes), so it needs to be split
            genes = [gene]
            if gene.find(",") >= 0:
                genes = parenthesis_aware_split(gene, delim=",")
            genes = [parenthesis_aware_split(g, delim=";") for g in genes]

            # if present, remove suffix in parenthesis, then keep unique gene ids only
            gene_ids = set()
            for sublist in genes:
                for g in sublist:
                    paren = g.find("(")
                    gene_ids.add(g[:paren] if paren >= 0 else g)

            # Gather the phenotype of every listed gene instead of keeping only
            # the last one visited. Sorted order keeps the output reproducible,
            # and the per-row list prevents reuse of the previous row's value.
            phenotypes = []
            for gene_id in sorted(gene_ids):
                try:
                    phenotype = omim_gene_phenotype_map[gene_id]
                except KeyError:
                    continue
                if phenotype not in phenotypes:
                    phenotypes.append(phenotype)
            omim_phenotype = ";".join(phenotypes) if phenotypes else "NA"

            table_out.write(
                delim.join(lsplit[:insert_at] + ['"' + omim_phenotype + '"'] + lsplit[insert_at:]) + "\n"
            )