def BedFileVenn(infiles, outfile):
    '''merge :term:`bed` formatted *infiles* by intersection
    and write to *outfile*.

    Only intervals that overlap in all files are retained.
    Interval coordinates are given by the first file in *infiles*.

    * With one input file, the file is copied to *outfile* unchanged.
    * With two input files, the first file is intersected directly
      with the second (``intersectBed -u``).
    * With three or more input files, each file is first normalized
      with ``mergeBed`` (overlapping intervals within a file are
      merged) and the files are then intersected incrementally.

    If ``P.isEmpty`` reports any input file as empty, *outfile* is
    created with ``P.touch`` and no intersection is run.

    Except in the single-file copy, column 4 of the output is
    renumbered starting from 1.

    :param infiles: sequence of one or more bed file names.
    :param outfile: name of the bed file to write.
    :raises ValueError: if *infiles* is empty.
    '''
    if not infiles:
        raise ValueError("BedFileVenn requires at least one input file")

    # NOTE(review): P.run() is called without arguments, so it presumably
    # picks up ``statement``, ``to_cluster`` and the values for the
    # ``%(name)s`` placeholders from this function's local variables.
    # Keep the local names ``statement``, ``fn``, ``tmpfile`` and
    # ``outfile`` stable - confirm against the pipeline module.
    to_cluster = True

    if len(infiles) == 1:
        shutil.copyfile(infiles[0], outfile)
        return

    # An empty input means the intersection is empty; decide this before
    # any temporary file is created so nothing needs cleaning up.
    if any(P.isEmpty(fn) for fn in infiles):
        P.touch(outfile)
        return

    if len(infiles) == 2:
        # file names are substituted here; %%(outfile)s is left for P.run()
        statement = '''
        intersectBed -u -a %s -b %s 
        | cut -f 1,2,3,4,5 
        | awk 'BEGIN { OFS="\\t"; } {$4=++a; print;}'
        > %%(outfile)s 
        ''' % (infiles[0], infiles[1])
        P.run()
        return

    # three or more files: need to merge incrementally
    tmpfile = P.getTempFilename(".")
    try:
        fn = infiles[0]
        statement = '''mergeBed -i %(fn)s > %(tmpfile)s'''
        P.run()

        for fn in infiles[1:]:
            # keep intervals of the running result that overlap the
            # normalized intervals of the next file
            statement = '''mergeBed -i %(fn)s | intersectBed -u -a %(tmpfile)s -b stdin > %(tmpfile)s.tmp; mv %(tmpfile)s.tmp %(tmpfile)s'''
            P.run()

        statement = '''cat %(tmpfile)s
        | cut -f 1,2,3,4,5 
        | awk 'BEGIN { OFS="\\t"; } {$4=++a; print;}'
        > %(outfile)s '''
        P.run()
    finally:
        # remove the temporary file even if one of the commands failed
        if os.path.exists(tmpfile):
            os.unlink(tmpfile)
# Example no. 2
# 0
def buildNUMTs(infile, outfile):
    '''build annotation with nuclear mitochondrial sequences.

    map mitochondrial chromosome against genome using
    exonerate

    The contig named by ``PARAMS["numts_mitochrom"]`` is extracted
    from the genome into a temporary file and aligned against the
    genome with exonerate. The raw hits are written to
    ``<outfile>.links.gz``; hits with a score of at least
    ``PARAMS["numts_score"]`` are written to *outfile* as one
    ``numts`` entry per hit.

    *outfile* is created with ``P.touch`` and nothing else is done if
    no mitochondrial contig is configured, if it is not found in the
    genome, or if the extracted sequence is empty.

    :param infile: not referenced in this function.
    :param outfile: name of the output file; also used as prefix for
        the ``.log`` and ``.links.gz`` side files.
    '''
    from contextlib import closing

    if not PARAMS["numts_mitochrom"]:
        E.info("skipping numts creation")
        P.touch(outfile)
        return

    fasta = IndexedFasta.IndexedFasta(
        os.path.join(PARAMS["genome_dir"], PARAMS["genome"]))

    if PARAMS["numts_mitochrom"] not in fasta:
        E.warn("mitochondrial genome %s not found" % PARAMS["numts_mitochrom"])
        P.touch(outfile)
        return

    # NOTE(review): P.run() is called without arguments, so it presumably
    # reads ``statement`` and the values for the ``%(name)s`` placeholders
    # (``tmpfile_mito``, ``min_score``, ``format``, ``outfile``) from this
    # function's local variables - keep those names stable.
    tmpfile_mito = P.getTempFilename(".")

    statement = '''
    python %(scriptsdir)s/index_fasta.py 
           --extract=%(numts_mitochrom)s
           --log=%(outfile)s.log
           %(genome_dir)s/%(genome)s
    > %(tmpfile_mito)s
    '''

    P.run()

    if P.isEmpty(tmpfile_mito):
        E.warn("mitochondrial genome empty.")
        os.unlink(tmpfile_mito)
        P.touch(outfile)
        return

    # exonerate --ryo fields, in the order they are parsed below
    columns = ("qi", "qS", "qab", "qae",
               "ti", "tS", "tab", "tae",
               "s",
               "pi",
               "C")

    # ``format`` shadows the builtin, but the name is required by the
    # ``%(format)s`` placeholder in the statement below.
    format = "\\\\t".join(["%%%s" % x for x in columns])

    # collect all results
    min_score = 100

    statement = '''
    cat %(genome_dir)s/%(genome)s.fasta
    | %(cmd-farm)s --split-at-regex=\"^>(\\S+)\" --chunksize=1 --log=%(outfile)s.log
    "exonerate --target %%STDIN%%
              --query %(tmpfile_mito)s
              --model affine:local
              --score %(min_score)i
              --showalignment no --showsugar no --showcigar no 
              --showvulgar no
              --ryo \\"%(format)s\\n\\" 
    " 
    | grep -v -e "exonerate" -e "Hostname"
    | gzip > %(outfile)s.links.gz
    '''

    P.run()

    # the extracted mitochondrial sequence is only needed as the
    # exonerate query; remove it so it does not accumulate in the
    # working directory
    os.unlink(tmpfile_mito)

    # convert to gtf
    min_score = PARAMS["numts_score"]

    c = E.Counter()

    # closing() guarantees both handles are released even if a line
    # fails to parse
    with closing(IOTools.openFile("%s.links.gz" % outfile)) as inf, \
            closing(IOTools.openFile(outfile, "w")) as outf:

        for line in inf:
            (query_contig, query_strand, query_start, query_end,
             target_contig, target_strand, target_start, target_end,
             score, pid, alignment) = line[:-1].split("\t")

            c.input += 1
            score = int(score)
            if score < min_score:
                c.skipped += 1
                continue

            # reverse-strand hits are reported end-first; swap so that
            # start < end
            if target_strand == "-":
                target_start, target_end = target_end, target_start

            gff = GTF.Entry()
            gff.contig = target_contig
            gff.start, gff.end = int(target_start), int(target_end)
            assert gff.start < gff.end

            gff.strand = target_strand
            gff.score = score
            gff.feature = "numts"
            # gene and transcript share one identifier built from the
            # query (mitochondrial) coordinates
            query_id = "%s:%s-%s" % (query_contig, query_start, query_end)
            gff.gene_id = query_id
            gff.transcript_id = query_id
            outf.write("%s\n" % str(gff))
            c.output += 1

    E.info("filtering numts: %s" % str(c))
# Example no. 3
# 0
def buildPseudogenes(infiles, outfile):
    '''annotate genomic regions with reference gene set.

    *infiles* is a pair ``(gtf, peptides_fasta)``: an ENSEMBL gtf
    file (read through ``zcat``) and a fasta file of peptide
    sequences used as exonerate queries.

    This task selects all pseudogenic transcripts in a single file.

    Pseudogenes are:

    * gene_type or transcript_type contains the phrase "pseudo". This taken from
      the database.

    * feature 'processed_transcript' with similarity to protein coding genes. Similarity
      is assessed by aligning with exonerate.

    Pseudogenic transcripts can overlap with protein coding transcripts.

    The raw exonerate hits are written to ``<outfile>.links.gz``. If no
    candidate sequences are extracted, *outfile* is created with
    ``P.touch`` and nothing else is done.

    :param infiles: pair of file names ``(gtf, peptides_fasta)``.
    :param outfile: name of the gtf file to write.
    '''
    from contextlib import closing

    infile_gtf, infile_peptides_fasta = infiles

    # NOTE(review): P.run() is called without arguments, so it presumably
    # reads ``statement`` and the values for the ``%(name)s`` placeholders
    # (``infile_gtf``, ``infile_peptides_fasta``, ``tmpfile1``,
    # ``outfile``) from this function's local variables - keep those
    # names stable.
    tmpfile1 = P.getTempFilename(".")

    statement = '''
    zcat %(infile_gtf)s 
    | awk '$2 ~ /processed/'
    | python %(scriptsdir)s/gff2fasta.py 
            --is-gtf
            --genome-file=%(genome_dir)s/%(genome)s
            --log=%(outfile)s.log
    > %(tmpfile1)s
    '''

    P.run()

    if P.isEmpty(tmpfile1):
        E.warn("no pseudogenes found")
        os.unlink(tmpfile1)
        P.touch(outfile)
        return

    statement = '''
    cat %(tmpfile1)s 
    | %(cmd-farm)s --split-at-regex=\"^>(\\S+)\" --chunksize=100 --log=%(outfile)s.log
    "exonerate --target %%STDIN%%
              --query %(infile_peptides_fasta)s
              --model protein2dna 
              --bestn 1 
              --score 200
              --ryo \\"%%qi\\\\t%%ti\\\\t%%s\\\\n\\" 
              --showalignment no --showsugar no --showcigar no --showvulgar no
    " 
    | grep -v -e "exonerate" -e "Hostname"
    | gzip > %(outfile)s.links.gz
    '''

    P.run()

    os.unlink(tmpfile1)

    # keep, per transcript, the peptide hit with the highest score
    # (on ties the later line wins)
    best_matches = {}
    with closing(IOTools.openFile("%s.links.gz" % outfile)) as inf:
        for line in inf:
            peptide_id, transcript_id, score = line[:-1].split("\t")
            score = int(score)
            if transcript_id in best_matches and \
                    best_matches[transcript_id][0] > score:
                continue
            best_matches[transcript_id] = (score, peptide_id)

    E.info("found %i best links" % len(best_matches))
    new_pseudos = set(best_matches.keys())

    # transcripts already flagged as pseudogenes in the database;
    # close the connection as soon as the ids have been collected
    dbhandle = sqlite3.connect(PARAMS["database"])
    try:
        cc = dbhandle.cursor()
        known_pseudos = set([x[0] for x in cc.execute("""SELECT DISTINCT transcript_id 
                              FROM transcript_info 
                               WHERE transcript_biotype like '%pseudo%' OR
                                     gene_biotype like '%pseudo%' """)])
    finally:
        dbhandle.close()

    E.info("pseudo processed=%i, known pseudos=%i, intersection=%i" % (
        (len(new_pseudos), len(known_pseudos),
         len(new_pseudos.intersection(known_pseudos)))))

    all_pseudos = new_pseudos.union(known_pseudos)

    c = E.Counter()

    # closing() releases both the output and the gtf input handle even
    # if iteration fails part-way
    with closing(IOTools.openFile(outfile, "w")) as outf, \
            closing(IOTools.openFile(infile_gtf)) as gtf_file:
        for gtf in GTF.iterator(gtf_file):
            c.input += 1
            if gtf.transcript_id not in all_pseudos:
                continue
            c.output += 1
            outf.write("%s\n" % gtf)

    E.info("exons: %s" % str(c))
# Example no. 4
# 0
def buildNUMTs(infile, outfile):
    '''build annotation with nuclear mitochondrial sequences.

    map mitochondrial chromosome against genome using
    exonerate

    The contig named by ``PARAMS["numts_mitochrom"]`` is extracted
    from the genome into a temporary file and aligned against the
    genome with exonerate. The raw hits are written to
    ``<outfile>.links.gz``; hits with a score of at least
    ``PARAMS["numts_score"]`` are written to *outfile* as one
    ``numts`` entry per hit.

    *outfile* is created with ``P.touch`` and nothing else is done if
    no mitochondrial contig is configured, if it is not found in the
    genome, or if the extracted sequence is empty.

    :param infile: not referenced in this function.
    :param outfile: name of the output file; also used as prefix for
        the ``.log`` and ``.links.gz`` side files.
    '''
    from contextlib import closing

    if not PARAMS["numts_mitochrom"]:
        E.info("skipping numts creation")
        P.touch(outfile)
        return

    fasta = IndexedFasta.IndexedFasta(
        os.path.join(PARAMS["genome_dir"], PARAMS["genome"]))

    if PARAMS["numts_mitochrom"] not in fasta:
        E.warn("mitochondrial genome %s not found" % PARAMS["numts_mitochrom"])
        P.touch(outfile)
        return

    # NOTE(review): P.run() is called without arguments, so it presumably
    # reads ``statement`` and the values for the ``%(name)s`` placeholders
    # (``tmpfile_mito``, ``min_score``, ``format``, ``outfile``) from this
    # function's local variables - keep those names stable.
    tmpfile_mito = P.getTempFilename(".")

    statement = '''
    python %(scriptsdir)s/index_fasta.py 
           --extract=%(numts_mitochrom)s
           --log=%(outfile)s.log
           %(genome_dir)s/%(genome)s
    > %(tmpfile_mito)s
    '''

    P.run()

    if P.isEmpty(tmpfile_mito):
        E.warn("mitochondrial genome empty.")
        os.unlink(tmpfile_mito)
        P.touch(outfile)
        return

    # exonerate --ryo fields, in the order they are parsed below
    columns = ("qi", "qS", "qab", "qae",
               "ti", "tS", "tab", "tae",
               "s",
               "pi",
               "C")

    # ``format`` shadows the builtin, but the name is required by the
    # ``%(format)s`` placeholder in the statement below.
    format = "\\\\t".join(["%%%s" % x for x in columns])

    # collect all results
    min_score = 100

    statement = '''
    cat %(genome_dir)s/%(genome)s.fasta
    | %(cmd-farm)s --split-at-regex=\"^>(\\S+)\" --chunksize=1 --log=%(outfile)s.log
    "exonerate --target %%STDIN%%
              --query %(tmpfile_mito)s
              --model affine:local
              --score %(min_score)i
              --showalignment no --showsugar no --showcigar no 
              --showvulgar no
              --ryo \\"%(format)s\\n\\" 
    " 
    | grep -v -e "exonerate" -e "Hostname"
    | gzip > %(outfile)s.links.gz
    '''

    P.run()

    # the extracted mitochondrial sequence is only needed as the
    # exonerate query; remove it so it does not accumulate in the
    # working directory
    os.unlink(tmpfile_mito)

    # convert to gtf
    min_score = PARAMS["numts_score"]

    c = E.Counter()

    # closing() guarantees both handles are released even if a line
    # fails to parse
    with closing(IOTools.openFile("%s.links.gz" % outfile)) as inf, \
            closing(IOTools.openFile(outfile, "w")) as outf:

        for line in inf:
            (query_contig, query_strand, query_start, query_end,
             target_contig, target_strand, target_start, target_end,
             score, pid, alignment) = line[:-1].split("\t")

            c.input += 1
            score = int(score)
            if score < min_score:
                c.skipped += 1
                continue

            # reverse-strand hits are reported end-first; swap so that
            # start < end
            if target_strand == "-":
                target_start, target_end = target_end, target_start

            gff = GTF.Entry()
            gff.contig = target_contig
            gff.start, gff.end = int(target_start), int(target_end)
            assert gff.start < gff.end

            gff.strand = target_strand
            gff.score = score
            gff.feature = "numts"
            # gene and transcript share one identifier built from the
            # query (mitochondrial) coordinates
            query_id = "%s:%s-%s" % (query_contig, query_start, query_end)
            gff.gene_id = query_id
            gff.transcript_id = query_id
            outf.write("%s\n" % str(gff))
            c.output += 1

    E.info("filtering numts: %s" % str(c))
# Example no. 5
# 0
def buildPseudogenes(infiles, outfile):
    '''annotate genomic regions with reference gene set.

    *infiles* is a pair ``(gtf, peptides_fasta)``: an ENSEMBL gtf
    file (read through ``zcat``) and a fasta file of peptide
    sequences used as exonerate queries.

    This task selects all pseudogenic transcripts in a single file.

    Pseudogenes are:

    * gene_type or transcript_type contains the phrase "pseudo". This taken from
      the database.

    * feature 'processed_transcript' with similarity to protein coding genes. Similarity
      is assessed by aligning with exonerate.

    Pseudogenic transcripts can overlap with protein coding transcripts.

    The raw exonerate hits are written to ``<outfile>.links.gz``. If no
    candidate sequences are extracted, *outfile* is created with
    ``P.touch`` and nothing else is done.

    :param infiles: pair of file names ``(gtf, peptides_fasta)``.
    :param outfile: name of the gtf file to write.
    '''
    from contextlib import closing

    infile_gtf, infile_peptides_fasta = infiles

    # NOTE(review): P.run() is called without arguments, so it presumably
    # reads ``statement`` and the values for the ``%(name)s`` placeholders
    # (``infile_gtf``, ``infile_peptides_fasta``, ``tmpfile1``,
    # ``outfile``) from this function's local variables - keep those
    # names stable.
    tmpfile1 = P.getTempFilename(".")

    statement = '''
    zcat %(infile_gtf)s 
    | awk '$2 ~ /processed/'
    | python %(scriptsdir)s/gff2fasta.py 
            --is-gtf
            --genome-file=%(genome_dir)s/%(genome)s
            --log=%(outfile)s.log
    > %(tmpfile1)s
    '''

    P.run()

    if P.isEmpty(tmpfile1):
        E.warn("no pseudogenes found")
        os.unlink(tmpfile1)
        P.touch(outfile)
        return

    statement = '''
    cat %(tmpfile1)s 
    | %(cmd-farm)s --split-at-regex=\"^>(\\S+)\" --chunksize=100 --log=%(outfile)s.log
    "exonerate --target %%STDIN%%
              --query %(infile_peptides_fasta)s
              --model protein2dna 
              --bestn 1 
              --score 200
              --ryo \\"%%qi\\\\t%%ti\\\\t%%s\\\\n\\" 
              --showalignment no --showsugar no --showcigar no --showvulgar no
    " 
    | grep -v -e "exonerate" -e "Hostname"
    | gzip > %(outfile)s.links.gz
    '''

    P.run()

    os.unlink(tmpfile1)

    # keep, per transcript, the peptide hit with the highest score
    # (on ties the later line wins)
    best_matches = {}
    with closing(IOTools.openFile("%s.links.gz" % outfile)) as inf:
        for line in inf:
            peptide_id, transcript_id, score = line[:-1].split("\t")
            score = int(score)
            if transcript_id in best_matches and \
                    best_matches[transcript_id][0] > score:
                continue
            best_matches[transcript_id] = (score, peptide_id)

    E.info("found %i best links" % len(best_matches))
    new_pseudos = set(best_matches.keys())

    # transcripts already flagged as pseudogenes in the database;
    # close the connection as soon as the ids have been collected
    dbhandle = sqlite3.connect(PARAMS["database"])
    try:
        cc = dbhandle.cursor()
        known_pseudos = set([x[0] for x in cc.execute("""SELECT DISTINCT transcript_id 
                              FROM transcript_info 
                               WHERE transcript_biotype like '%pseudo%' OR
                                     gene_biotype like '%pseudo%' """)])
    finally:
        dbhandle.close()

    E.info("pseudo processed=%i, known pseudos=%i, intersection=%i" % (
        (len(new_pseudos), len(known_pseudos),
         len(new_pseudos.intersection(known_pseudos)))))

    all_pseudos = new_pseudos.union(known_pseudos)

    c = E.Counter()

    # closing() releases both the output and the gtf input handle even
    # if iteration fails part-way
    with closing(IOTools.openFile(outfile, "w")) as outf, \
            closing(IOTools.openFile(infile_gtf)) as gtf_file:
        for gtf in GTF.iterator(gtf_file):
            c.input += 1
            if gtf.transcript_id not in all_pseudos:
                continue
            c.output += 1
            outf.write("%s\n" % gtf)

    E.info("exons: %s" % str(c))
def BedFileVenn(infiles, outfile):
    '''merge :term:`bed` formatted *infiles* by intersection
    and write to *outfile*.

    Only intervals that overlap in all files are retained.
    Interval coordinates are given by the first file in *infiles*.

    * With one input file, the file is copied to *outfile* unchanged.
    * With two input files, the first file is intersected directly
      with the second (``intersectBed -u``).
    * With three or more input files, each file is first normalized
      with ``mergeBed`` (overlapping intervals within a file are
      merged) and the files are then intersected incrementally.

    If ``P.isEmpty`` reports any input file as empty, *outfile* is
    created with ``P.touch`` and no intersection is run.

    Except in the single-file copy, column 4 of the output is
    renumbered starting from 1.

    :param infiles: sequence of one or more bed file names.
    :param outfile: name of the bed file to write.
    :raises ValueError: if *infiles* is empty.
    '''
    if not infiles:
        raise ValueError("BedFileVenn requires at least one input file")

    # NOTE(review): P.run() is called without arguments, so it presumably
    # picks up ``statement``, ``to_cluster`` and the values for the
    # ``%(name)s`` placeholders from this function's local variables.
    # Keep the local names ``statement``, ``fn``, ``tmpfile`` and
    # ``outfile`` stable - confirm against the pipeline module.
    to_cluster = True

    if len(infiles) == 1:
        shutil.copyfile(infiles[0], outfile)
        return

    # An empty input means the intersection is empty; decide this before
    # any temporary file is created so nothing needs cleaning up.
    if any(P.isEmpty(fn) for fn in infiles):
        P.touch(outfile)
        return

    if len(infiles) == 2:
        # file names are substituted here; %%(outfile)s is left for P.run()
        statement = '''
        intersectBed -u -a %s -b %s 
        | cut -f 1,2,3,4,5 
        | awk 'BEGIN { OFS="\\t"; } {$4=++a; print;}'
        > %%(outfile)s 
        ''' % (infiles[0], infiles[1])
        P.run()
        return

    # three or more files: need to merge incrementally
    tmpfile = P.getTempFilename(".")
    try:
        fn = infiles[0]
        statement = '''mergeBed -i %(fn)s > %(tmpfile)s'''
        P.run()

        for fn in infiles[1:]:
            # keep intervals of the running result that overlap the
            # normalized intervals of the next file
            statement = '''mergeBed -i %(fn)s | intersectBed -u -a %(tmpfile)s -b stdin > %(tmpfile)s.tmp; mv %(tmpfile)s.tmp %(tmpfile)s'''
            P.run()

        statement = '''cat %(tmpfile)s
        | cut -f 1,2,3,4,5 
        | awk 'BEGIN { OFS="\\t"; } {$4=++a; print;}'
        > %(outfile)s '''
        P.run()
    finally:
        # remove the temporary file even if one of the commands failed
        if os.path.exists(tmpfile):
            os.unlink(tmpfile)