Example #1
def wiggleReader(f):
    '''
	Read a wiggle (http://genome.ucsc.edu/goldenPath/help/wiggle) file of any supported style.

	Parameters
	----------
	f : file
		file in wiggle format. Can be fixedStep, variableStep, or bed4

	Yields
	------
	chrom, start, end, strand, score
	'''
    current_chrom = None
    current_pos = None
    current_step = None

    # always for wiggle data
    strand = '+'

    mode = "bed"
    for line in ireader.reader(f):
        if line.isspace() or line.startswith(("track", "#", "browser")):
            continue
        elif line.startswith("variableStep"):
            header = parse_header(line)
            current_chrom = header['chrom']
            current_pos = None
            current_step = None
            if 'span' in header: current_span = int(header['span'])
            else: current_span = 1
            mode = "variableStep"
        elif line.startswith("fixedStep"):
            header = parse_header(line)
            current_chrom = header['chrom']
            current_pos = int(header['start']) - 1
            current_step = int(header['step'])
            if 'span' in header: current_span = int(header['span'])
            else: current_span = 1
            mode = "fixedStep"
        elif mode == "bed":
            fields = line.split()
            if len(fields) > 3:
                if len(fields) > 5:
                    yield fields[0], int(fields[1]), int(
                        fields[2]), fields[5], float(fields[3])
                else:
                    yield fields[0], int(fields[1]), int(
                        fields[2]), strand, float(fields[3])
        elif mode == "variableStep":
            fields = line.split()
            pos = int(fields[0]) - 1
            yield current_chrom, pos, pos + current_span, strand, float(
                fields[1])
        elif mode == "fixedStep":
            yield current_chrom, current_pos, current_pos + current_span, strand, float(
                line.split()[0])
            current_pos += current_step
        else:
            raise Exception("Unexpected input line: %s" % line.strip())
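A minimal usage sketch for the generator above; the input file name is an illustrative placeholder, and wiggleReader is assumed to be in scope (e.g. imported from CrossMap's utility module):

# Hypothetical usage: stream records from a wiggle file of any style.
for chrom, start, end, strand, score in wiggleReader('scores.wig'):  # 'scores.wig' is a placeholder
    # coordinates are 0-based, half-open; strand is always '+' for wiggle data
    print(chrom, start, end, strand, score)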
Example #2
def read_chain_file(chain_file, print_table=False):
    '''
	Read chain file.

	Parameters
	----------
	chain_file : file
		Chain format file. The input chain_file can be plain text, a compressed file
		(".gz", ".Z", ".z", ".bz", ".bz2", ".bzip2"), or a URL pointing to the chain file
		("http://", "https://", "ftp://"). If a URL is used, the chain file must be plain text.

	print_table : bool, optional
		Print mappings in human readable table.

	Returns
	-------
	maps : dict
		Dictionary with source chrom name as key, IntervalTree object as value. An
		IntervalTree contains many intervals. An interval is a start and end position
		and a value, e.g. Interval(11, 12, strand="-", value="abc")

	target_chromSize : dict
		Chromosome sizes of target genome

	source_chromSize : dict
		Chromosome sizes of source genome
	'''

    logging.info("Read the chain file \"%s\" " % chain_file)
    maps = {}
    target_chromSize = {}
    source_chromSize = {}
    if print_table:
        blocks = []

    for line in ireader.reader(chain_file):
        # Example: chain 4900 chrY 58368225 + 25985403 25985638 chr5 151006098 - 43257292 43257528 1
        if not line.strip():
            continue
        line = line.strip()
        if line.startswith(('#', ' ')): continue
        fields = line.split()

        if fields[0] == 'chain' and len(fields) in [12, 13]:
            #score = int(fields[1])		  # Alignment score
            source_name = fields[2]  # E.g. chrY
            source_size = int(fields[3])  # Full length of the chromosome
            source_strand = fields[4]  # Must be +
            if source_strand != '+':
                raise Exception(
                    "Source strand in a chain file must be +. (%s)" % line)
            source_start = int(fields[5])  # Start of source region
            #source_end = int(fields[6])	  # End of source region

            target_name = fields[7]  # E.g. chr5
            target_size = int(fields[8])  # Full length of the chromosome
            target_strand = fields[9]  # + or -
            target_start = int(fields[10])
            #target_end = int(fields[11])
            target_chromSize[target_name] = target_size
            source_chromSize[source_name] = source_size

            if target_strand not in ['+', '-']:
                raise Exception("Target strand must be - or +. (%s)" % line)
            #chain_id = None if len(fields) == 12 else fields[12]
            if source_name not in maps:
                maps[source_name] = Intersecter()

            sfrom, tfrom = source_start, target_start

        # Now read the alignment chain from the file and store it as a list (source_from, source_to) -> (target_from, target_to)
        elif fields[0] != 'chain' and len(fields) == 3:
            size, sgap, tgap = int(fields[0]), int(fields[1]), int(fields[2])
            if print_table:
                if target_strand == '+':
                    blocks.append(
                        (source_name, sfrom, sfrom + size, source_strand,
                         target_name, tfrom, tfrom + size, target_strand))
                elif target_strand == '-':
                    blocks.append(
                        (source_name, sfrom, sfrom + size, source_strand,
                         target_name, target_size - (tfrom + size),
                         target_size - tfrom, target_strand))

            if target_strand == '+':
                maps[source_name].add_interval(
                    Interval(
                        sfrom, sfrom + size,
                        (target_name, tfrom, tfrom + size, target_strand)))
            elif target_strand == '-':
                maps[source_name].add_interval(
                    Interval(sfrom, sfrom + size,
                             (target_name, target_size - (tfrom + size),
                              target_size - tfrom, target_strand)))

            sfrom += size + sgap
            tfrom += size + tgap

        elif fields[0] != 'chain' and len(fields) == 1:
            size = int(fields[0])
            if print_table:
                if target_strand == '+':
                    blocks.append(
                        (source_name, sfrom, sfrom + size, source_strand,
                         target_name, tfrom, tfrom + size, target_strand))
                elif target_strand == '-':
                    blocks.append(
                        (source_name, sfrom, sfrom + size, source_strand,
                         target_name, target_size - (tfrom + size),
                         target_size - tfrom, target_strand))

            if target_strand == '+':
                maps[source_name].add_interval(
                    Interval(
                        sfrom, sfrom + size,
                        (target_name, tfrom, tfrom + size, target_strand)))
            elif target_strand == '-':
                maps[source_name].add_interval(
                    Interval(sfrom, sfrom + size,
                             (target_name, target_size - (tfrom + size),
                              target_size - tfrom, target_strand)))
        else:
            raise Exception("Invalid chain format. (%s)" % line)
    #if (sfrom + size) != source_end  or (tfrom + size) != target_end:
    #	 raise Exception("Alignment blocks do not match specified block sizes. (%s)" % header)

    if print_table:
        for i in blocks:
            print('\t'.join([str(n) for n in i]))

    return (maps, target_chromSize, source_chromSize)
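A minimal usage sketch of read_chain_file; the chain file name is a placeholder and the cmmodule.utils import path is an assumption about how CrossMap exposes this helper:

from cmmodule.utils import read_chain_file  # assumed import path

# Build the mapping trees plus the two chromosome-size dictionaries.
mapTree, targetChromSizes, sourceChromSizes = read_chain_file('hg19ToHg38.over.chain.gz')
print(list(mapTree.keys())[:5])        # source chromosomes covered by the chain file
print(sourceChromSizes.get('chr1'))    # length of chr1 in the source assembly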
Example #3
def crossmap_vcf_file(mapping,
                      infile,
                      outfile,
                      liftoverfile,
                      refgenome,
                      noCompAllele=False,
                      compress=False):
    '''
	Convert genome coordinates in VCF format.

	Parameters
	----------
	mapping : dict
		Dictionary with source chrom name as key, IntervalTree object as value.

	infile : file
		Input file in VCF format. Can be a regular or compressed (*.gz, *.Z,*.z, *.bz,
		*.bz2, *.bzip2) file, local file or URL (http://, https://, ftp://) pointing to
		remote file.

	outfile : str
		prefix of output files.

	liftoverfile : file
		Chain (https://genome.ucsc.edu/goldenPath/help/chain.html) format file. Can be a
		regular or compressed (*.gz, *.Z,*.z, *.bz, *.bz2, *.bzip2) file, local file or
		URL (http://, https://, ftp://) pointing to remote file.

	refgenome : file
		The genome sequence file of 'target' assembly in FASTA format.

	noCompAllele : bool
		A logical value indicating whether to compare ref_allele to alt_allele after
		liftover. If False (the default), a variant is written to the "unmap" file when
		ref_allele == alt_allele; if True, such variants are kept.
	'''

    if noCompAllele:
        printlog(
            ["Keep variants [reference_allele == alternative_allele] ..."])
    else:
        printlog([
            "Filter out variants [reference_allele == alternative_allele] ..."
        ])

    #index refgenome file if it hasn't been done
    if not os.path.exists(refgenome + '.fai'):
        printlog(["Creating index for", refgenome])
        pysam.faidx(refgenome)

    refFasta = pysam.Fastafile(refgenome)

    FILE_OUT = open(outfile, 'w')
    UNMAP = open(outfile + '.unmap', 'w')

    total = 0
    fail = 0
    withChr = False  # check if the VCF data lines use 'chr1' or '1'

    for line in ireader.reader(infile):
        if not line.strip():
            continue
        line = line.strip()

        #deal with meta-information lines.
        #meta-information lines needed in both mapped and unmapped files
        if line.startswith('##fileformat'):
            print(line, file=FILE_OUT)
            print(line, file=UNMAP)
        elif line.startswith('##INFO'):
            print(line, file=FILE_OUT)
            print(line, file=UNMAP)
        elif line.startswith('##FILTER'):
            print(line, file=FILE_OUT)
            print(line, file=UNMAP)
        elif line.startswith('##FORMAT'):
            print(line, file=FILE_OUT)
            print(line, file=UNMAP)
        elif line.startswith('##ALT'):
            print(line, file=FILE_OUT)
            print(line, file=UNMAP)
        elif line.startswith('##SAMPLE'):
            print(line, file=FILE_OUT)
            print(line, file=UNMAP)
        elif line.startswith('##PEDIGREE'):
            print(line, file=FILE_OUT)
            print(line, file=UNMAP)

        #meta-information lines needed in unmapped files
        elif line.startswith('##assembly'):
            print(line, file=UNMAP)
        elif line.startswith('##contig'):
            print(line, file=UNMAP)
            if 'ID=chr' in line:
                withChr = True

        #update contig information
        elif line.startswith('#CHROM'):
            printlog(["Updating contig field ... "])
            target_gsize = dict(
                list(zip(refFasta.references, refFasta.lengths)))
            for chr_id in sorted(target_gsize):
                if chr_id.startswith('chr'):
                    if withChr is True:
                        print("##contig=<ID=%s,length=%d,assembly=%s>" %
                              (chr_id, target_gsize[chr_id],
                               os.path.basename(refgenome)),
                              file=FILE_OUT)
                    else:
                        print("##contig=<ID=%s,length=%d,assembly=%s>" %
                              (chr_id.replace('chr', ''), target_gsize[chr_id],
                               os.path.basename(refgenome)),
                              file=FILE_OUT)
                else:
                    if withChr is True:
                        print("##contig=<ID=%s,length=%d,assembly=%s>" %
                              ('chr' + chr_id, target_gsize[chr_id],
                               os.path.basename(refgenome)),
                              file=FILE_OUT)
                    else:
                        print("##contig=<ID=%s,length=%d,assembly=%s>" %
                              (chr_id, target_gsize[chr_id],
                               os.path.basename(refgenome)),
                              file=FILE_OUT)

            print(
                "##liftOverProgram=<CrossMap,version=%s,website=https://sourceforge.net/projects/crossmap>"
                % __version__,
                file=FILE_OUT)
            print("##liftOverChainFile=<%s>" % liftoverfile, file=FILE_OUT)
            print("##originalFile=<%s>" % infile, file=FILE_OUT)
            print("##targetRefGenome=<%s>" % refgenome, file=FILE_OUT)
            print("##liftOverDate=<%s>" %
                  datetime.date.today().strftime("%B%d,%Y"),
                  file=FILE_OUT)
            print(line, file=FILE_OUT)
            print(line, file=UNMAP)
            printlog(["Lifting over ... "])

        else:
            if line.startswith('#'): continue
            fields = str.split(line, maxsplit=7)
            total += 1

            chrom = fields[0]
            start = int(fields[1]) - 1  # 0 based
            end = start + len(fields[3])

            a = map_coordinates(mapping, chrom, start, end, '+')
            if a is None:
                print(line + "\tFail(Unmap)", file=UNMAP)
                fail += 1
                continue

            if len(a) == 2:
                # update chrom
                target_chr = str(
                    a[1][0]
                )  #target_chr is from chain file, could be 'chr1' or '1'
                target_start = a[1][1]
                target_end = a[1][2]
                fields[0] = target_chr

                # update start coordinate
                fields[1] = target_start + 1

                # update ref allele
                target_chr = update_chromID(refFasta.references[0], target_chr)
                try:
                    fields[3] = refFasta.fetch(target_chr, target_start,
                                               target_end).upper()
                except:
                    print(line + "\tFail(KeyError)", file=UNMAP)
                    fail += 1
                    continue

                # update END if any
                fields[7] = re.sub(r'END=\d+', 'END=' + str(target_end),
                                   fields[7])

                if a[1][3] == '-':
                    fields[4] = revcomp_DNA(fields[4], True)

                # check if ref_allele is the same as alt_allele
                if noCompAllele:
                    print('\t'.join(map(str, fields)), file=FILE_OUT)
                else:
                    if fields[3] != fields[4]:
                        print('\t'.join(map(str, fields)), file=FILE_OUT)
                    else:
                        print(line + "\tFail(REF==ALT)", file=UNMAP)
                        fail += 1
            else:
                print(line + "\tFail(Multiple_hits)", file=UNMAP)
                fail += 1
                continue
    FILE_OUT.close()
    UNMAP.close()

    printlog(["Total entries:", str(total)])
    printlog(["Failed to map:", str(fail)])

    if compress:
        try:
            printlog(["Compressing \"%s\" ..." % outfile])
            subprocess.call("gzip " + outfile, shell=True)
        except:
            pass
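A minimal calling sketch for crossmap_vcf_file; the import paths, chain file, VCF, and FASTA names are placeholders/assumptions:

from cmmodule.utils import read_chain_file      # assumed import path
from cmmodule.mapvcf import crossmap_vcf_file   # assumed import path

mapTree, targetChromSizes, sourceChromSizes = read_chain_file('hg19ToHg38.over.chain.gz')
crossmap_vcf_file(mapping=mapTree,
                  infile='input.hg19.vcf',
                  outfile='output.hg38.vcf',
                  liftoverfile='hg19ToHg38.over.chain.gz',
                  refgenome='hg38.fa',
                  noCompAllele=False,
                  compress=False)
# mapped records are written to 'output.hg38.vcf'; failures to 'output.hg38.vcf.unmap'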
Example #4
def crossmap_bed_file(mapping,
                      inbed,
                      outfile=None,
                      unmapfile=None,
                      cstyle='a'):
    '''
	Convert genome coordinates (in bed format) between assemblies.
	BED format: http://genome.ucsc.edu/FAQ/FAQformat.html#format1

	Parameters
	----------
	mapping : dict
		Dictionary with source chrom name as key, IntervalTree object as value.

	inbed : file
		Input BED file.

	outfile : str, optional
		Prefix of output files.

	unmapfile: str, optional
		Name of file to save unmapped entries. This option will be ignored if outfile is None.

	cstyle : str, optional
		Chromosome ID style. Must be one of ['a', 's', 'l'], where
		'a' : as-is. The chromosome ID of the output file is in the same style as the input file.
		's' : short ID, such as "1", "2", "X".
		'l' : long ID, such as "chr1", "chr2", "chrX".
	'''

    # check if 'outfile' was set. If not set, print to screen, if set, print to file
    if outfile is not None:
        FILE_OUT = open(outfile, 'w')
        if unmapfile is not None:
            UNMAP = open(unmapfile, 'w')
        else:
            UNMAP = open(outfile + '.unmap', 'w')
    else:
        pass

    for line in ireader.reader(inbed):
        if line.startswith(('#', 'track', 'browser')): continue
        if not line.strip(): continue
        line = line.strip()
        fields = line.split()
        strand = '+'

        # filter out line less than 3 columns
        if len(fields) < 3:
            print("Less than 3 fields. skip " + line, file=sys.stderr)
            if outfile:
                print(line + '\tInvalidBedFormat', file=UNMAP)
            continue
        try:
            int(fields[1])
        except:
            print("Start coordinate is not an integer. skip " + line,
                  file=sys.stderr)
            if outfile:
                print(line + '\tInvalidStartPosition', file=UNMAP)
            continue
        try:
            int(fields[2])
        except:
            print("End coordinate is not an integer. skip " + line,
                  file=sys.stderr)
            if outfile:
                print(line + '\tInvalidEndPosition', file=UNMAP)
            continue
        if int(fields[1]) > int(fields[2]):
            print(
                "\"Start\" coordinate is larger than \"End\" coordinate. skip "
                + line,
                file=sys.stderr)
            if outfile:
                print(line + '\tStart>End', file=UNMAP)
            continue

        # deal with bed less than 12 columns
        if len(fields) < 12:

            # try to reset strand
            try:
                for f in fields:
                    if f in ['+', '-']:
                        strand = f
            except:
                pass

            chrom = fields[0]
            start = int(fields[1])
            end = int(fields[2])

            a = map_coordinates(mapping,
                                chrom,
                                start,
                                end,
                                strand,
                                chrom_style=cstyle)

            try:
                if (a is None) or (len(a) % 2 != 0):
                    if outfile is None:
                        print(line + '\tUnmap')
                    else:
                        print(line + '\tUnmap', file=UNMAP)
                    continue

                if len(a) == 2:
                    #reset fields
                    fields[0] = a[1][0]
                    fields[1] = a[1][1]
                    fields[2] = a[1][2]
                    for i in range(
                            0, len(fields)):  #update the strand information
                        if fields[i] in ['+', '-']:
                            fields[i] = a[1][3]

                    if outfile is None:
                        print(line + '\t->\t' +
                              '\t'.join([str(i) for i in fields]))
                    else:
                        print('\t'.join([str(i) for i in fields]),
                              file=FILE_OUT)
                if len(a) > 2:
                    count = 0
                    for j in range(1, len(a), 2):
                        count += 1
                        fields[0] = a[j][0]
                        fields[1] = a[j][1]
                        fields[2] = a[j][2]
                        for i in range(
                                0,
                                len(fields)):  #update the strand information
                            if fields[i] in ['+', '-']:
                                fields[i] = a[j][3]

                        if outfile is None:
                            print(line + '\t' + '(split.' + str(count) + ':' +
                                  ':'.join([str(i) for i in a[j - 1]]) +
                                  ')\t' + '\t'.join([str(i) for i in fields]))
                        else:
                            print('\t'.join([str(i) for i in fields]),
                                  file=FILE_OUT)
            except:
                if outfile is None:
                    print(line + '\tFail')
                else:
                    print(line + '\tFail', file=UNMAP)
                continue

        # deal with bed12 and bed12+8 (genePred format)
        if len(fields) == 12 or len(fields) == 20:
            strand = fields[5]
            if strand not in ['+', '-']:
                raise Exception("Unknown strand: %s. Can only be '+' or '-'." %
                                strand)
            fail_flag = False
            exons_old_pos = annoGene.getExonFromLine(
                line)  #[[chr,st,end],[chr,st,end],...]
            #print exons_old_pos
            exons_new_pos = []
            for e_chr, e_start, e_end in exons_old_pos:
                # a has two elements, first is query, 2nd is target. # [('chr1', 246974830, 246974833,'+'), ('chr1', 248908207, 248908210,'+')]
                a = map_coordinates(mapping,
                                    e_chr,
                                    e_start,
                                    e_end,
                                    strand,
                                    chrom_style=cstyle)
                if a is None:
                    fail_flag = True
                    break

                if len(a) == 2:
                    exons_new_pos.append(a[1])
                else:
                    fail_flag = True
                    break

            if not fail_flag:
                # check if all exons were mapped to the same chromosome and the same strand
                chr_id = set()
                exon_strand = set()

                for e_chr, e_start, e_end, e_strand in exons_new_pos:
                    chr_id.add(e_chr)
                    exon_strand.add(e_strand)
                if len(chr_id) != 1 or len(exon_strand) != 1:
                    fail_flag = True

                if not fail_flag:
                    # build new bed
                    cds_start_offset = int(fields[6]) - int(fields[1])
                    cds_end_offset = int(fields[2]) - int(fields[7])
                    new_chrom = exons_new_pos[0][0]
                    new_chrom_st = exons_new_pos[0][1]
                    new_chrom_end = exons_new_pos[-1][2]
                    new_name = fields[3]
                    new_score = fields[4]
                    new_strand = exons_new_pos[0][3]
                    new_thickStart = new_chrom_st + cds_start_offset
                    new_thickEnd = new_chrom_end - cds_end_offset
                    new_ittemRgb = fields[8]
                    new_blockCount = len(exons_new_pos)
                    new_blockSizes = ','.join(
                        [str(o - n) for m, n, o, p in exons_new_pos])
                    new_blockStarts = ','.join([
                        str(n - new_chrom_st) for m, n, o, p in exons_new_pos
                    ])

                    new_bedline = '\t'.join(
                        str(i)
                        for i in (new_chrom, new_chrom_st, new_chrom_end,
                                  new_name, new_score, new_strand,
                                  new_thickStart, new_thickEnd, new_ittemRgb,
                                  new_blockCount, new_blockSizes,
                                  new_blockStarts))
                    if check_bed12(new_bedline) is False:
                        fail_flag = True
                    else:
                        if outfile is None:
                            print(line + '\t->\t' + new_bedline)
                        else:
                            print(new_bedline, file=FILE_OUT)

            if fail_flag:
                if outfile is None:
                    print(line + '\tFail')
                else:
                    print(line, file=UNMAP)
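A minimal calling sketch for crossmap_bed_file; file names are placeholders and the import paths are assumptions:

from cmmodule.utils import read_chain_file     # assumed import path
from cmmodule.mapbed import crossmap_bed_file  # assumed import path

mapTree, _, _ = read_chain_file('hg19ToHg38.over.chain.gz')
# With outfile=None the converted lines go to stdout; here they go to a file,
# and entries that cannot be lifted go to the explicit unmapfile.
crossmap_bed_file(mapTree, 'regions.hg19.bed', outfile='regions.hg38.bed',
                  unmapfile='regions.hg19.unmap.bed', cstyle='a')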
Example #5
def crossmap_gff_file(mapping, ingff, outfile=None, cstyle='a'):
    '''
	Description
	-----------
	Convert genome coordinates (in GFF/GTF format) between assemblies.
	GFF (General Feature Format) lines have nine required fields that must be Tab-separated:

	1. seqname - The name of the sequence. Must be a chromosome or scaffold.
	2. source - The program that generated this feature.
	3. feature - The name of this type of feature. Some examples of standard feature types
	   are "CDS", "start_codon", "stop_codon", and "exon".
	4. start - The starting position of the feature in the sequence. The first base is numbered 1.
	5. end - The ending position of the feature (inclusive).
	6. score - A score between 0 and 1000. If the track line useScore attribute is set to 1
	   for this annotation data set, the score value will determine the level of gray in
	   which this feature is displayed (higher numbers = darker gray). If there is no score
	   value, enter ".".
	7. strand - Valid entries include '+', '-', or '.' (for don't know/don't care).
	8. frame - If the feature is a coding exon, frame should be a number between 0-2 that
	   represents the reading frame of the first base. If the feature is not a coding exon,
	   the value should be '.'.
	9. group - All lines with the same group are linked together into a single item.

	GFF format: http://genome.ucsc.edu/FAQ/FAQformat.html#format3

	GTF (Gene Transfer Format) is a refinement to GFF that tightens the specification. The
	first eight GTF fields are the same as GFF. The group field has been expanded into a
	list of attributes. Each attribute consists of a type/value pair. Attributes must end
	in a semi-colon, and be separated from any following attribute by exactly one space.

	GTF format: http://genome.ucsc.edu/FAQ/FAQformat.html#format4

	We do NOT check if features (exon, CDS, etc.) originally belonging to the same gene were
	converted into the same chromosome/strand.

	Parameters
	----------
	mapping : dict
		Dictionary with source chrom name as key, IntervalTree object as value.

	ingff : file
		Input GFF/GTF file.

	outfile : str, optional
		Prefix of output files.

	cstyle : str, optional
		Chromosome ID style. Must be one of ['a', 's', 'l'], where
		'a' : as-is. The chromosome ID of the output file is in the same style as the input file.
		's' : short ID, such as "1", "2", "X".
		'l' : long ID, such as "chr1", "chr2", "chrX".
	'''

    if outfile is not None:
        rand_str = ''.join(
            random.choices(string.ascii_uppercase + string.digits, k=8))
        FILE_OUT = open(outfile, 'w')
        UNMAP = open(outfile + '.' + rand_str + '.unmap', 'w')

    for line in ireader.reader(ingff):
        if line.startswith(('#', 'track', 'browser', 'visibility')): continue
        if not line.strip(): continue

        line = line.strip()
        fields = line.split('\t')
        try:
            start = int(fields[3]) - 1  #0-based
            end = int(fields[4])
            feature_size = end - start
        except:
            print('Cannot recognize \"start\" and \"end\" coordinates. Skip ' +
                  line,
                  file=sys.stderr)
            if outfile:
                print(line, file=UNMAP)
            continue
        if fields[6] not in ['+', '-', '.']:
            print('Cannot recognize \"strand\". Skip ' + line, file=sys.stderr)
            if outfile:
                print(line, file=UNMAP)
            continue

        strand = '-' if fields[6] == '-' else '+'

        chrom = fields[0]
        a = map_coordinates(mapping,
                            chrom,
                            start,
                            end,
                            strand,
                            chrom_style=cstyle)

        if a is None:
            if outfile is None:
                print(line + '\tfail (no match to target assembly)')
            else:
                print(line, file=UNMAP)
            continue
        if len(a) != 2:
            if outfile is None:
                print(line + '\tfail (multiple matches to target assembly)')
            else:
                print(line, file=UNMAP)
        else:
            if (int(a[1][2]) - int(
                    a[1][1])) != feature_size:  # check if it is exact match
                if outfile is None:
                    print(line + '\tfail (not exact match)')
                else:
                    print(line, file=UNMAP)
            fields[0] = a[1][0]  # chrom
            fields[3] = int(a[1][1]) + 1  # start, 1-based
            fields[4] = int(a[1][2])
            fields[6] = a[1][3]

            if outfile is None:
                print(line + '\t->\t' + '\t'.join([str(i) for i in fields]))
            else:
                print('\t'.join([str(i) for i in fields]), file=FILE_OUT)
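A minimal calling sketch for crossmap_gff_file; file names are placeholders and the import paths are assumptions:

from cmmodule.utils import read_chain_file     # assumed import path
from cmmodule.mapgff import crossmap_gff_file  # assumed import path

mapTree, _, _ = read_chain_file('hg19ToHg38.over.chain.gz')
crossmap_gff_file(mapTree, 'genes.hg19.gtf', outfile='genes.hg38.gtf', cstyle='a')
# unmapped features are written to 'genes.hg38.gtf.<random_string>.unmap'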
Example #6
def crossmap_maf_file(mapping, infile, outfile, liftoverfile, refgenome,
                      ref_name):
    '''
	Convert genome coordinates in MAF (Mutation Annotation Format) files.

	Parameters
	----------
	mapping : dict
		Dictionary with source chrom name as key, IntervalTree object as value.

	infile : file
		Input file in MAF format. Can be a regular or compressed (*.gz, *.Z, *.z, *.bz,
		*.bz2, *.bzip2) file, local file or URL (http://, https://, ftp://) pointing to
		remote file.

	outfile : str
		prefix of output files.

	liftoverfile : file
		Chain (https://genome.ucsc.edu/goldenPath/help/chain.html) format file. Can be a
		regular or compressed (*.gz, *.Z,*.z, *.bz, *.bz2, *.bzip2) file, local file or
		URL (http://, https://, ftp://) pointing to remote file.

	refgenome : file
		The genome sequence file of 'target' assembly in FASTA format.
	ref_name : str
		The NCBI build name of the target assembly, for example, "GRCh37", "GRCh38".
	'''

    #index refgenome file if it hasn't been done
    if not os.path.exists(refgenome + '.fai'):
        logging.info("Creating index for: %s" % refgenome)
        pysam.faidx(refgenome)
    if os.path.getctime(refgenome + '.fai') < os.path.getctime(refgenome):
        logging.info(
            "Index file is older than reference genome. Re-creating index for: %s"
            % refgenome)
        pysam.faidx(refgenome)

    refFasta = pysam.Fastafile(refgenome)

    FILE_OUT = open(outfile, 'w')
    UNMAP = open(outfile + '.unmap', 'w')

    total = 0
    fail = 0

    for line in ireader.reader(infile):
        if not line.strip():
            continue
        line = line.strip()

        #meta-information lines needed in both mapped and unmapped files
        if line.startswith('#'):
            print(line, file=FILE_OUT)
            print(line, file=UNMAP)
            continue
        elif line.startswith('Hugo_Symbol'):
            print(
                "#liftOver: Program=%sv%s, Time=%s, ChainFile=%s, NewRefGenome=%s"
                % ("CrossMap", __version__,
                   datetime.date.today().strftime("%B%d,%Y"), liftoverfile,
                   refgenome),
                file=FILE_OUT)
            print(line, file=FILE_OUT)
            print(line, file=UNMAP)
            logging.info("Lifting over ... ")
        else:

            fields = str.split(line, sep='\t')
            total += 1

            fields[3] = ref_name
            chrom = fields[4]
            start = int(fields[5]) - 1  # 0 based
            end = int(fields[6])
            #strand = fields[7]

            a = map_coordinates(mapping, chrom, start, end, '+')

            if a is None:
                print(line, file=UNMAP)
                fail += 1
                continue

            if len(a) == 2:
                target_chr = str(
                    a[1][0]
                )  #target_chr is from chain file, could be 'chr1' or '1'
                target_start = a[1][1]
                target_end = a[1][2]

                # update chrom
                fields[4] = target_chr

                # update start coordinate
                fields[5] = target_start + 1

                # update end
                fields[6] = target_end

                # update ref allele
                try:
                    target_chr = update_chromID(refFasta.references[0],
                                                target_chr)
                    fields[10] = refFasta.fetch(target_chr, target_start,
                                                target_end).upper()
                except:
                    print(line, file=UNMAP)
                    fail += 1
                    continue

                if a[1][3] == '-':
                    fields[10] = revcomp_DNA(fields[10], True)
                print('\t'.join(map(str, fields)), file=FILE_OUT)

            else:
                print(line, file=UNMAP)
                fail += 1
                continue
    FILE_OUT.close()
    UNMAP.close()
    logging.info("Total entries: %d", total)
    logging.info("Failed to map: %d", fail)
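A minimal calling sketch for crossmap_maf_file; file names are placeholders and the import paths are assumptions:

from cmmodule.utils import read_chain_file     # assumed import path
from cmmodule.mapmaf import crossmap_maf_file  # assumed import path

mapTree, _, _ = read_chain_file('hg19ToHg38.over.chain.gz')
# ref_name restamps the NCBI_Build column of every mapped record.
crossmap_maf_file(mapTree, 'variants.hg19.maf', 'variants.hg38.maf',
                  'hg19ToHg38.over.chain.gz', 'hg38.fa', ref_name='GRCh38')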
Example #7
def crossmap_gvcf_file(mapping, infile, outfile, liftoverfile, refgenome, noCompAllele = False, compress = False, cstyle = 'a'):
	'''
	Convert genome coordinates in GVCF format.

	Parameters
	----------
	mapping : dict
		Dictionary with source chrom name as key, IntervalTree object as value.

	infile : file
		Input file in GVCF format. Can be a regular or compressed (*.gz, *.Z,*.z, *.bz,
		*.bz2, *.bzip2) file, local file or URL (http://, https://, ftp://) pointing to
		remote file.

	outfile : str
		prefix of output files.

	liftoverfile : file
		Chain (https://genome.ucsc.edu/goldenPath/help/chain.html) format file. Can be a
		regular or compressed (*.gz, *.Z,*.z, *.bz, *.bz2, *.bzip2) file, local file or
		URL (http://, https://, ftp://) pointing to remote file.

	refgenome : file
		The genome sequence file of 'target' assembly in FASTA format.

	noCompAllele : bool
		A logical value indicating whether to compare ref_allele to alt_allele after
		liftover. If False (the default), a variant is written to the "unmap" file when
		ref_allele == alt_allele; if True, such variants are kept.

	cstyle : str, optional
		Chromosome ID style. Must be one of ['a', 's', 'l'], where
		'a' : as-is. The chromosome ID of the output file is in the same style as the input file.
		's' : short ID, such as "1", "2", "X".
		'l' : long ID, such as "chr1", "chr2", "chrX".
	'''

	if noCompAllele:
		logging.info("Keep variants [reference_allele == alternative_allele] ...")
	else:
		logging.info("Filter out variants [reference_allele == alternative_allele] ...")

	#index refgenome file if it hasn't been done
	if not os.path.exists(refgenome + '.fai'):
		logging.info("Creating index for: %s" % refgenome)
		pysam.faidx(refgenome)
	if os.path.getmtime(refgenome + '.fai') < os.path.getmtime(refgenome):
		logging.info("Index file is older than reference genome. Re-creating index for: %s" % refgenome)
		pysam.faidx(refgenome)

	refFasta = pysam.Fastafile(refgenome)

	FILE_OUT = open(outfile ,'w')
	UNMAP = open(outfile + '.unmap','w')

	total_var = 0
	failed_var = 0
	total_region = 0
	failed_region = 0
	chr_template = '1'  # default chrom ID template; reset to 'chr1' if ##contig IDs carry the 'chr' prefix

	for line in ireader.reader(infile):
		if not line.strip():
			continue
		line=line.strip()

		#deal with meta-information lines.
		#meta-information lines needed in both mapped and unmapped files
		if line.startswith('##fileformat'):
			print(line, file=FILE_OUT)
			print(line, file=UNMAP)
		elif line.startswith('##INFO'):
			print(line, file=FILE_OUT)
			print(line, file=UNMAP)
		elif line.startswith('##FILTER'):
			print(line, file=FILE_OUT)
			print(line, file=UNMAP)
		elif line.startswith('##FORMAT'):
			print(line, file=FILE_OUT)
			print(line, file=UNMAP)
		elif line.startswith('##ALT'):
			print(line, file=FILE_OUT)
			print(line, file=UNMAP)
		elif line.startswith('##SAMPLE'):
			print(line, file=FILE_OUT)
			print(line, file=UNMAP)
		elif line.startswith('##PEDIGREE'):
			print(line, file=FILE_OUT)
			print(line, file=UNMAP)
		elif line.startswith('##GVCFBlock'):
			print(line, file=FILE_OUT)
			print(line, file=UNMAP)
		elif line.startswith('##GATKCommandLine'):
			print(line, file=FILE_OUT)
			print(line, file=UNMAP)
		elif line.startswith('##source'):
			print(line, file=FILE_OUT)
			print(line, file=UNMAP)

		#meta-information lines needed in unmapped files
		elif line.startswith('##assembly'):
			print(line, file=UNMAP)
		elif line.startswith('##contig'):
			print(line, file=UNMAP)
			if 'ID=chr' in line:
				chr_template = 'chr1'
			else:
				chr_template = '1'

		#update contig information
		elif line.startswith('#CHROM'):
			logging.info("Updating contig field ... ")
			target_gsize = dict(list(zip(refFasta.references, refFasta.lengths)))
			for chr_id in sorted(target_gsize):
				if chr_id.startswith('chr'):
					#if withChr is True:
					print("##contig=<ID=%s,length=%d,assembly=%s>" % (update_chromID(chr_template, chr_id, cstyle), target_gsize[chr_id], os.path.basename(refgenome)), file=FILE_OUT)

			print("##liftOverProgram=<CrossMap,version=%s,website=https://sourceforge.net/projects/crossmap>" % __version__, file=FILE_OUT)
			print("##liftOverChainFile=<%s>" % liftoverfile, file=FILE_OUT)
			print("##originalFile=<%s>" % infile, file=FILE_OUT)
			print("##targetRefGenome=<%s>" % refgenome, file=FILE_OUT)
			print("##liftOverDate=<%s>" % datetime.date.today().strftime("%B%d,%Y"), file=FILE_OUT)
			print(line, file=FILE_OUT)
			print(line, file=UNMAP)
			logging.info("Lifting over ... ")

		else:
			if line.startswith('#'):continue

			# process non-variant region
			if 'END=' in line:
				fields = str.split(line,maxsplit=8)
				total_region += 1
				chrom = fields[0]
				start = int(fields[1])-1	 # 0 based
				try:
					m = re.search(r"END\=(\d+)", line)
					end = int(m[1])
				except:
					print (line + "\tFail(Unmap)", file=UNMAP)
					failed_region += 1
					continue

				a = map_coordinates(mapping, chrom, start, end, '+', chrom_style = cstyle)
				if a is None:
					print (line + "\tFail(Unmap)", file=UNMAP)
					failed_region += 1
					continue
				if len(a) == 2:
					# update chrom
					target_chr = str(a[1][0])	#target_chr is from chain file, could be 'chr1' or '1'
					target_start = a[1][1]
					target_end = a[1][2]
					fields[0] = target_chr

					# update start coordinate
					fields[1] = target_start + 1

					# update END
					fields[7] = fields[7].replace(('END=' + str(end)), ('END=' + str(target_end)))
					print('\t'.join(map(str, fields)), file=FILE_OUT)

			# process variant line
			else:

				fields = str.split(line,maxsplit=7)
				total_var += 1
				chrom = fields[0]
				start = int(fields[1])-1	 	# 0 based, ref_allele start
				end = start + len(fields[3])	# ref_allele end
				alt_allele = fields[4].replace(' ','').split(',')[0]	# 20  10000598    .   T   A,<NON_REF> 1754.77 .   DP=54;

				a = map_coordinates(mapping, chrom, start, end, '+', chrom_style = cstyle)
				if a is None:
					print (line + "\tFail(Unmap)", file=UNMAP)
					failed_var += 1
					continue

				if len(a) == 2:
					# update chrom
					target_chr = str(a[1][0])	#target_chr is from chain file, could be 'chr1' or '1'
					target_start = a[1][1]
					target_end = a[1][2]
					fields[0] = target_chr

					# update start coordinate
					fields[1] = target_start + 1

					# update ref allele
					try:
						target_chr = update_chromID(refFasta.references[0], target_chr)
						fields[3] = refFasta.fetch(target_chr,target_start,target_end).upper()
					except:
						print(line + "\tFail(No_targetRef)", file=UNMAP)
						failed_var += 1
						continue

					if a[1][3] == '-':
						fields[4] = revcomp_DNA(alt_allele, True) + ',<NON_REF>'

					# check if ref_allele is the same as alt_allele
					if noCompAllele:
						print('\t'.join(map(str, fields)), file=FILE_OUT)
					else:
						if fields[3] != fields[4]:
							print('\t'.join(map(str, fields)), file=FILE_OUT)
						else:
							print (line + "\tFail(REF==ALT)", file=UNMAP)
							failed_var += 1

				else:
					print (line + "\tFail(Multiple_hits)", file=UNMAP)
					failed_var += 1
					continue
	FILE_OUT.close()
	UNMAP.close()
	logging.info ("Total variants: %d" % total_var)
	logging.info ("Variants failed to map: %d" % failed_var)
	logging.info ("Total non-variant regions: %d" % total_region)
	logging.info ("Non-variant regions failed to map: %d" % failed_region)

	if compress:
		try:
			logging.info("Compressing \"%s\" ..." % outfile)
			subprocess.call("gzip " + outfile, shell=True)
		except:
			pass
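A minimal calling sketch for this cstyle-aware GVCF converter; file names are placeholders and the import paths are assumptions:

from cmmodule.utils import read_chain_file       # assumed import path
from cmmodule.mapgvcf import crossmap_gvcf_file  # assumed import path

mapTree, _, _ = read_chain_file('hg19ToHg38.over.chain.gz')
crossmap_gvcf_file(mapTree, 'sample.hg19.g.vcf', 'sample.hg38.g.vcf',
                   'hg19ToHg38.over.chain.gz', 'hg38.fa',
                   noCompAllele=False, compress=False, cstyle='a')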
Example #8
def crossmap_region_file(mapping,
                         inbed,
                         outfile=None,
                         min_ratio=0.85,
                         cstyle='a'):
    '''
	Convert large genomic regions (in bed format) between assemblies.
	BED format: http://genome.ucsc.edu/FAQ/FAQformat.html#format1

	Parameters
	----------
	mapping : dict
		Dictionary with source chrom name as key, IntervalTree object as value.

	inbed : file
		Input BED file.

	outfile : str, optional
		Prefix of output files.

	min_ratio : float, optional
		Minimum ratio of query bases that must remap

	cstyle : str, optional
		Chromosome ID style. Must be one of ['a', 's', 'l'], where
		'a' : as-is. The chromosome ID of the output file is in the same style as the input file.
		's' : short ID, such as "1", "2", "X".
		'l' : long ID, such as "chr1", "chr2", "chrX".
	'''

    # check if 'outfile' was set. If not set, print to screen, if set, print to file
    if outfile is not None:
        FILE_OUT = open(outfile, 'w')
        UNMAP = open(outfile + '.unmap', 'w')
    else:
        pass

    for line in ireader.reader(inbed):
        if line.startswith(('#', 'track', 'browser')): continue
        if not line.strip(): continue
        line = line.strip()
        fields = line.split()
        strand = '+'

        # filter out line less than 3 columns
        if len(fields) < 3:
            print("Less than 3 fields. skip " + line, file=sys.stderr)
            if outfile:
                print(line + '\tInvalidBedFormat', file=UNMAP)
            continue
        try:
            int(fields[1])
        except:
            print("Start coordinate is not an integer. skip " + line,
                  file=sys.stderr)
            if outfile:
                print(line + '\tInvalidStartPosition', file=UNMAP)
            continue
        try:
            int(fields[2])
        except:
            print("End coordinate is not an integer. skip " + line,
                  file=sys.stderr)
            if outfile:
                print(line + '\tInvalidEndPosition', file=UNMAP)
            continue
        if int(fields[1]) > int(fields[2]):
            print(
                "\"Start\" coordinate is larger than \"End\" coordinate. skip "
                + line,
                file=sys.stderr)
            if outfile:
                print(line + '\tStart>End', file=UNMAP)
            continue

        # try to reset strand
        try:
            for f in fields:
                if f in ['+', '-']:
                    strand = f
        except:
            pass

        chrom = fields[0]
        start = int(fields[1])
        end = int(fields[2])
        total_query_length = end - start  #used to calculate q_map_ratio

        a = map_coordinates(mapping,
                            chrom,
                            start,
                            end,
                            strand,
                            chrom_style=cstyle)
        # input: 'chr1',246974830,247024835
        # output: [('chr1', 246974830, 246974833, '+' ), ('chr1', 248908207, 248908210, '+' ), ('chr1', 247024833, 247024835, '+'), ('chr1', 249058210, 249058212,'+')]
        # [('chr1', 246974830, 246974833), ('chr1', 248908207, 248908210)]

        if (a is None) or (len(a) % 2 != 0):
            if outfile is None:
                print(line + '\tFail\tUnmap')
            else:
                print(line + '\tFail\tUnmap', file=UNMAP)
            continue

        #when a == 2, there is one-to-one match (i.e. 100% match)
        if len(a) == 2:
            #reset fields to target assembly
            fields[0] = a[1][0]
            fields[1] = a[1][1]
            fields[2] = a[1][2]
            for i in range(0, len(fields)):  #update the strand information
                if fields[i] in ['+', '-']:
                    fields[i] = a[1][3]

            if outfile is None:
                print(line + '\t->\t' + '\t'.join([str(i) for i in fields]) +
                      "\tmap_ratio=1.0000")
            else:
                print('\t'.join([str(i)
                                 for i in fields]) + "\tmap_ratio=1.0000",
                      file=FILE_OUT)

        #when a is an even number but bigger than 2, each segment is 100% match,
        # but the whole region is not. In this case, check *min_ratio* of the query
        if len(a) > 2:
            a_query = a[::
                        2]  #EVEN: [('chr1', 246974830, 246974833, '+'), ('chr1', 247024833, 247024835, '+')]
            a_query_mapped_nt = sum([i[2] - i[1]
                                     for i in a_query])  #sum([3,2])
            a_target = a[
                1::
                2]  #ODDS: [('chr1', 248908207, 248908210, '+'), ('chr1', 249058210, 249058212, '+')]
            a_target_chroms = set([i[0] for i in a_target])
            a_target_starts = [i[1] for i in a_target]
            a_target_ends = [i[2] for i in a_target]
            #print (a_target_ends)
            map_ratio = a_query_mapped_nt / total_query_length

            #map_ratio > cutoff
            if map_ratio >= min_ratio:
                if len(a_target_chroms) == 1:
                    t_chrom = a_target_chroms.pop()
                    fields[0] = t_chrom
                    fields[1] = min(a_target_starts)
                    fields[2] = max(a_target_ends)
                    if outfile is None:
                        print(line + '\t->\t' +
                              '\t'.join([str(i) for i in fields]) +
                              ("\tmap_ratio=%.4f" % map_ratio))
                    else:
                        print('\t'.join([str(i) for i in fields]) +
                              ("\tmap_ratio=%.4f" % map_ratio),
                              file=FILE_OUT)
                else:
                    if outfile is None: print(line + '\tFail\tCrossChroms')
                    else: print(line + '\tFail\tCrossChroms', file=UNMAP)
            # map_ratio > 0 but < cutoff
            elif map_ratio > 0 and map_ratio < min_ratio:
                if outfile is None:
                    print(line + '\tFail' + ("\tmap_ratio=%.4f" % map_ratio))
                else:
                    print(line + '\tFail' + ("\tmap_ratio=%.4f" % map_ratio),
                          file=UNMAP)
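A minimal calling sketch for crossmap_region_file; file names are placeholders and the import paths are assumptions:

from cmmodule.utils import read_chain_file           # assumed import path
from cmmodule.mapregion import crossmap_region_file  # assumed import path

mapTree, _, _ = read_chain_file('hg19ToHg38.over.chain.gz')
# Regions are reported as mapped only when at least min_ratio of their bases
# lift over to a single target chromosome.
crossmap_region_file(mapTree, 'cnv_calls.hg19.bed', outfile='cnv_calls.hg38.bed',
                     min_ratio=0.85, cstyle='a')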
Example #9
def crossmap_gvcf_file(mapping, infile, outfile, liftoverfile, refgenome):
    '''
	Convert genome coordinates in GVCF format.

	Parameters
	----------
	mapping : dict
		Dictionary with source chrom name as key, IntervalTree object as value.

	infile : file
		Input file in GVCF format. Can be a regular or compressed (*.gz, *.Z,*.z, *.bz,
		*.bz2, *.bzip2) file, local file or URL (http://, https://, ftp://) pointing to
		remote file.

	outfile : str
		prefix of output files.

	liftoverfile : file
		Chain (https://genome.ucsc.edu/goldenPath/help/chain.html) format file. Can be a
		regular or compressed (*.gz, *.Z,*.z, *.bz, *.bz2, *.bzip2) file, local file or
		URL (http://, https://, ftp://) pointing to remote file.

	refgenome : file
		The genome sequence file of 'target' assembly in FASTA format.
	'''

    #index refgenome file if it hasn't been done
    if not os.path.exists(refgenome + '.fai'):
        printlog(["Creating index for", refgenome])
        pysam.faidx(refgenome)

    refFasta = pysam.Fastafile(refgenome)

    FILE_OUT = open(outfile, 'w')
    UNMAP = open(outfile + '.unmap', 'w')

    total_var = 0
    failed_var = 0
    total_region = 0
    failed_region = 0
    withChr = False  # check if the VCF data lines use 'chr1' or '1'

    for line in ireader.reader(infile):
        if not line.strip():
            continue
        line = line.strip()

        #deal with meta-information lines.
        #meta-information lines needed in both mapped and unmapped files
        if line.startswith('##fileformat'):
            print(line, file=FILE_OUT)
            print(line, file=UNMAP)
        elif line.startswith('##INFO'):
            print(line, file=FILE_OUT)
            print(line, file=UNMAP)
        elif line.startswith('##FILTER'):
            print(line, file=FILE_OUT)
            print(line, file=UNMAP)
        elif line.startswith('##FORMAT'):
            print(line, file=FILE_OUT)
            print(line, file=UNMAP)
        elif line.startswith('##ALT'):
            print(line, file=FILE_OUT)
            print(line, file=UNMAP)
        elif line.startswith('##SAMPLE'):
            print(line, file=FILE_OUT)
            print(line, file=UNMAP)
        elif line.startswith('##PEDIGREE'):
            print(line, file=FILE_OUT)
            print(line, file=UNMAP)
        elif line.startswith('##GVCFBlock'):
            print(line, file=FILE_OUT)
            print(line, file=UNMAP)
        elif line.startswith('##GATKCommandLine'):
            print(line, file=FILE_OUT)
            print(line, file=UNMAP)
        elif line.startswith('##source'):
            print(line, file=FILE_OUT)
            print(line, file=UNMAP)

        #meta-information lines needed in unmapped files
        elif line.startswith('##assembly'):
            print(line, file=UNMAP)
        elif line.startswith('##contig'):
            print(line, file=UNMAP)
            if 'ID=chr' in line:
                withChr = True

        #update contig information
        elif line.startswith('#CHROM'):
            printlog(["Updating contig field ... "])
            target_gsize = dict(
                list(zip(refFasta.references, refFasta.lengths)))
            for chr_id in sorted(target_gsize):
                if chr_id.startswith('chr'):
                    if withChr is True:
                        print("##contig=<ID=%s,length=%d,assembly=%s>" %
                              (chr_id, target_gsize[chr_id],
                               os.path.basename(refgenome)),
                              file=FILE_OUT)
                    else:
                        print("##contig=<ID=%s,length=%d,assembly=%s>" %
                              (chr_id.replace('chr', ''), target_gsize[chr_id],
                               os.path.basename(refgenome)),
                              file=FILE_OUT)
                else:
                    if withChr is True:
                        print("##contig=<ID=%s,length=%d,assembly=%s>" %
                              ('chr' + chr_id, target_gsize[chr_id],
                               os.path.basename(refgenome)),
                              file=FILE_OUT)
                    else:
                        print("##contig=<ID=%s,length=%d,assembly=%s>" %
                              (chr_id, target_gsize[chr_id],
                               os.path.basename(refgenome)),
                              file=FILE_OUT)

            print(
                "##liftOverProgram=<CrossMap,version=%s,website=https://sourceforge.net/projects/crossmap>"
                % __version__,
                file=FILE_OUT)
            print("##liftOverChainFile=<%s>" % liftoverfile, file=FILE_OUT)
            print("##originalFile=<%s>" % infile, file=FILE_OUT)
            print("##targetRefGenome=<%s>" % refgenome, file=FILE_OUT)
            print("##liftOverDate=<%s>" %
                  datetime.date.today().strftime("%B%d,%Y"),
                  file=FILE_OUT)
            print(line, file=FILE_OUT)
            print(line, file=UNMAP)
            printlog(["Lifting over ... "])

        else:
            if line.startswith('#'): continue

            # process non-variant region
            if 'END=' in line:
                fields = str.split(line, maxsplit=8)
                total_region += 1
                chrom = fields[0]
                start = int(fields[1]) - 1  # 0 based
                try:
                    m = re.search(r"END\=(\d+)", line)
                    end = int(m[1])
                except:
                    print(line + "\tFail(Unmap)", file=UNMAP)
                    failed_region += 1
                    continue

                a = map_coordinates(mapping, chrom, start, end, '+')
                if a is None:
                    print(line + "\tFail(Unmap)", file=UNMAP)
                    failed_region += 1
                    continue
                if len(a) == 2:
                    # update chrom
                    target_chr = str(
                        a[1][0]
                    )  #target_chr is from chain file, could be 'chr1' or '1'
                    target_start = a[1][1]
                    target_end = a[1][2]
                    fields[0] = target_chr

                    # update start coordinate
                    fields[1] = target_start + 1

                    # update END
                    fields[7] = fields[7].replace(('END=' + str(end)),
                                                  ('END=' + str(target_end)))
                    print('\t'.join(map(str, fields)), file=FILE_OUT)

            # process variant line
            else:

                fields = str.split(line, maxsplit=7)
                total_var += 1
                chrom = fields[0]
                start = int(fields[1]) - 1  # 0 based, ref_allele start
                end = start + len(fields[3])  # ref_allele end
                alt_allele = fields[4].replace(' ', '').split(
                    ','
                )[0]  # 20  10000598    .   T   A,<NON_REF> 1754.77 .   DP=54;

                a = map_coordinates(mapping, chrom, start, end, '+')
                if a is None:
                    print(line + "\tFail(Unmap)", file=UNMAP)
                    failed_var += 1
                    continue

                if len(a) == 2:
                    # update chrom
                    target_chr = str(
                        a[1][0]
                    )  #target_chr is from chain file, could be 'chr1' or '1'
                    target_start = a[1][1]
                    target_end = a[1][2]
                    fields[0] = target_chr

                    # update start coordinate
                    fields[1] = target_start + 1

                    # update ref allele
                    target_chr = update_chromID(refFasta.references[0],
                                                target_chr)
                    fields[3] = refFasta.fetch(target_chr, target_start,
                                               target_end).upper()

                    if a[1][3] == '-':
                        fields[4] = revcomp_DNA(alt_allele,
                                                True) + ',<NON_REF>'

                    #ref_allele and alt_allele are different
                    if fields[3] != alt_allele:
                        print('\t'.join(map(str, fields)), file=FILE_OUT)
                    else:
                        print(line + "\tFail(REF==ALT)", file=UNMAP)
                        failed_var += 1
                else:
                    print(line + "\tFail(Multiple_hits)", file=UNMAP)
                    failed_var += 1
                    continue
    FILE_OUT.close()
    UNMAP.close()
    printlog(["Total variants:", str(total_var)])
    printlog(["Variants failed to map:", str(failed_var)])
    printlog(["Total non-variant regions:", str(total_region)])
    printlog(["Non-variant regions failed to map:", str(failed_region)])
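A minimal calling sketch for this older GVCF converter, which takes no cstyle or compress options; file names are placeholders and the function above is assumed to be in scope:

from cmmodule.utils import read_chain_file  # assumed import path

mapTree, _, _ = read_chain_file('hg19ToHg38.over.chain.gz')
crossmap_gvcf_file(mapTree, 'sample.hg19.g.vcf', 'sample.hg38.g.vcf',
                   'hg19ToHg38.over.chain.gz', 'hg38.fa')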
Example #10
def crossmap_wig_file(mapping,
                      in_file,
                      out_prefix,
                      taget_chrom_size,
                      in_format,
                      binSize=100000):
    '''
	Description
	-----------
	Convert genome coordinates (in wiggle/bigwig format) between assemblies.
	wiggle format: http://genome.ucsc.edu/goldenPath/help/wiggle.html
	bigwig format: http://genome.ucsc.edu/goldenPath/help/bigWig.html

	Parameters
	----------
	mapping : dict
		Dictionary with source chrom name as key, IntervalTree object as value.

	in_file : file
		Input file in wig or bigwig format. Both "variableStep" and "fixedStep" wiggle
		lines are supported.

	out_prefix : str
		Prefix of output files.

	taget_chrom_size : dict
		Chromosome size of the target genome assembly. Key is chromosome ID, value is the
		length of the chromosome. Note, the chromosome ID and length information were
		extracted from the chain file, therefore, the chrom_IDs can be with or without
		the leading "chr".

	in_format : str
		Either "wiggle" or "bigwig"

	binSize : int
		The chunk size when reading bigwig file in each iteration.
	'''

    OUT_FILE1 = open(out_prefix + '.bgr', 'w')  # original bgr file
    OUT_FILE2 = open(out_prefix + '.sorted.bgr', 'w')  # sorted bgr file
    OUT_FILE3 = pyBigWig.open(out_prefix + '.bw', "w")  # bigwig file

    chrom_style = 'chr1'

    if in_format.upper() == "WIGGLE":
        logging.info("Liftover wiggle file \"%s\" to bedGraph file \"%s\"" %
                     (in_file, out_prefix + '.bgr'))

        for chrom, start, end, strand, score in wiggleReader(in_file):
            chrom_style = chrom
            maps = map_coordinates(mapping, chrom, start, end, '+')
            if maps is None:
                continue
            if len(maps) == 2:
                print('\t'.join([
                    str(i)
                    for i in [maps[1][0], maps[1][1], maps[1][2], score]
                ]),
                      file=OUT_FILE1)
            else:
                continue
            maps[:] = []
        OUT_FILE1.close()

        logging.info("Merging overlapped entries in bedGraph file")
        for (chrom, start, end, score) in bgrMerge.merge(out_prefix + '.bgr'):
            print('\t'.join([str(i) for i in (chrom, start, end, score)]),
                  file=OUT_FILE2)
        OUT_FILE2.close()

        os.remove(out_prefix + '.bgr')  #remove .bgr, keep .sorted.bgr

        # make bigwig header
        target_chroms_sorted = []
        for k in sorted(taget_chrom_size.keys()):
            i_chrom = update_chromID(chrom_style, k)
            i_value = taget_chrom_size[k]
            target_chroms_sorted.append((i_chrom, i_value))

        # add bigwig header
        logging.info("Writing header to \"%s\" ..." % (out_prefix + '.bw'))
        OUT_FILE3.addHeader(target_chroms_sorted)

        # add entries to bigwig file
        logging.info("Writing entries to \"%s\" ..." % (out_prefix + '.bw'))
        for line in ireader.reader(out_prefix + '.sorted.bgr'):
            r_chr, r_st, r_end, r_value = line.split()
            OUT_FILE3.addEntries([r_chr], [int(r_st)],
                                 ends=[int(r_end)],
                                 values=[float(r_value)])

        OUT_FILE3.close()

    elif in_format.upper() == "BIGWIG":
        logging.info("Liftover bigwig file %s to bedGraph file %s:" %
                     (in_file, out_prefix + '.bgr'))
        for chrom, start, end, score in bigwigReader(in_file):
            chrom_style = chrom
            maps = map_coordinates(mapping, chrom, start, end, '+')
            try:
                if maps is None: continue
                if len(maps) == 2:
                    print('\t'.join([
                        str(i)
                        for i in [maps[1][0], maps[1][1], maps[1][2], score]
                    ]),
                          file=OUT_FILE1)
                else:
                    continue
            except:
                continue
            maps[:] = []
        OUT_FILE1.close()

        logging.info("Merging overlapped entries in bedGraph file")
        for (chrom, start, end, score) in bgrMerge.merge(out_prefix + '.bgr'):
            print('\t'.join([str(i) for i in (chrom, start, end, score)]),
                  file=OUT_FILE2)
        OUT_FILE2.close()
        os.remove(out_prefix + '.bgr')  #remove .bgr, keep .sorted.bgr

        logging.info("Writing header to \"%s\" ..." % (out_prefix + '.bw'))

        # make bigwig header
        target_chroms_sorted = []
        for k in sorted(taget_chrom_size.keys()):
            i_chrom = update_chromID(chrom_style, k)
            i_value = taget_chrom_size[k]
            target_chroms_sorted.append((i_chrom, i_value))

        # add bigwig header
        OUT_FILE3.addHeader(target_chroms_sorted)

        # add entries to bigwig file
        logging.info("Writing entries to \"%s\" ..." % (out_prefix + '.bw'))
        for line in ireader.reader(out_prefix + '.sorted.bgr'):
            r_chr, r_st, r_end, r_value = line.split()
            OUT_FILE3.addEntries([r_chr], [int(r_st)], [int(r_end)],
                                 [float(r_value)])
        OUT_FILE3.close()
    else:
        raise Exception("Unknown format. Must be 'wiggle' or 'bigwig'")
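A minimal calling sketch for crossmap_wig_file; the target chromosome sizes come from the same chain file, the file names are placeholders, and the import paths are assumptions:

from cmmodule.utils import read_chain_file     # assumed import path
from cmmodule.mapwig import crossmap_wig_file  # assumed import path

mapTree, targetChromSizes, _ = read_chain_file('hg19ToHg38.over.chain.gz')
crossmap_wig_file(mapTree, 'signal.hg19.wig', 'signal.hg38',
                  targetChromSizes, in_format='wiggle', binSize=100000)
# produces 'signal.hg38.sorted.bgr' and 'signal.hg38.bw' in target-assembly coordinates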