Ejemplo n.º 1
0
def import_blast(filename):
    """Record the best BLAST hit for each tandem-filtered gene.

    Every non-blank row of the file is split on whitespace.  The first two
    fields are the two sequence identifiers and the last field is parsed as a
    float score.  Both identifiers go through ``gene_name`` and are then
    translated with the module-level ``tandem_map``; rows with a gene missing
    from that map, or whose two genes translate to the same entry, are
    skipped.  Surviving hits are stored in the module-level ``blast_pool``.

    filename -- path of the BLAST file to read

    Returns nothing; progress messages go to stderr.
    """
    # `with open(...)` replaces the Python 2-only `file` builtin plus a
    # trailing close(), so the handle is released even when a row fails to
    # parse half-way through the file.
    with open(filename) as fp:
        # first pass only counts the rows for the progress message
        total_lines = sum(1 for row in fp)
        sys.stderr.write(
            "Read BLAST file %s (total %d lines)\n" % (filename, total_lines))
        fp.seek(0)

        j = 0
        for row in fp:
            j += 1
            if j % 100000 == 0:
                sys.stderr.write("%d read...\n" % j)

            atoms = row.split()
            if not atoms:
                # blank row (e.g. trailing newline): nothing to parse
                continue

            a, b, bitscore = atoms[0], atoms[1], float(atoms[-1])
            a, b = gene_name(a), gene_name(b)
            if a not in tandem_map or b not in tandem_map:
                continue
            a, b = tandem_map[a], tandem_map[b]

            # a hit between two members of the same tandem group is a self hit
            if a == b:
                continue

            # keep the best blast hit (the comparison itself lives in
            # Hit.update, which is defined elsewhere)
            if a not in blast_pool:
                blast_pool[a] = Hit(b, bitscore)
            else:
                blast_pool[a].update(b, bitscore)
Ejemplo n.º 2
0
def import_blast(filename):
    """Fill ``blast_pool`` with the best hit of every tandem-filtered gene.

    The file is read row by row.  A row is split on whitespace: fields one
    and two are the sequence identifiers, the final field is converted to a
    float score.  Identifiers are normalised with ``gene_name`` and mapped
    through the module-level ``tandem_map``.  A row is dropped when either
    gene is absent from ``tandem_map`` or when both genes map to the same
    entry.  Everything else is stored in the module-level ``blast_pool``.

    filename -- path of the BLAST file to read

    Returns nothing; progress is reported on stderr.
    """
    # The context manager guarantees the file is closed on every exit path;
    # `open` is used because the `file` builtin only exists on Python 2.
    with open(filename) as fp:
        # counting pass, needed only for the message below
        total_lines = sum(1 for row in fp)
        sys.stderr.write(
            "Read BLAST file %s (total %d lines)\n" % (filename, total_lines))
        fp.seek(0)

        j = 0
        for row in fp:
            j += 1
            if j % 100000 == 0:
                sys.stderr.write("%d read...\n" % j)

            atoms = row.split()
            if not atoms:
                # skip blank rows instead of failing on atoms[0]
                continue

            a, b, bitscore = atoms[0], atoms[1], float(atoms[-1])
            a, b = gene_name(a), gene_name(b)
            if a not in tandem_map or b not in tandem_map:
                continue
            a, b = tandem_map[a], tandem_map[b]

            # both genes collapsed onto the same entry: ignore
            if a == b:
                continue

            # keep the best blast hit (Hit.update, defined elsewhere, decides
            # whether the new hit replaces the stored one)
            if a not in blast_pool:
                blast_pool[a] = Hit(b, bitscore)
            else:
                blast_pool[a].update(b, bitscore)
Ejemplo n.º 3
0
def process_tandems(fp_blast, sizes, ranks, tandem):
    """Group tandem genes from a self-BLAST file and pick one gene per group.

    fp_blast -- seekable, iterable file object with the self-BLAST rows; the
                first two whitespace-separated fields are the identifiers
    sizes    -- dict, gene => size
    ranks    -- dict, gene => pair that unpacks as (chromosome, rank)
    tandem   -- grouping object; must provide ``join(a, b)`` and iterate over
                groups of genes (it is modified in place)

    Two genes are joined when they are on the same chromosome and their ranks
    differ by at most the module-level ``Tandem_Nmax``.

    Returns ``(tandem_map, tandem_removed)``: a dict mapping every grouped
    gene to the largest gene of its group, and the set of those chosen genes.
    Progress messages go to stderr.
    """
    # first pass only counts the rows for the progress message
    fp_blast.seek(0)
    total_lines = sum(1 for row in fp_blast)
    sys.stderr.write("Read self BLAST file (total %d lines)\n" % total_lines)

    fp_blast.seek(0)
    j = 0
    for row in fp_blast:
        j += 1
        if j % 100000 == 0:
            sys.stderr.write("%d read...\n" % j)

        atoms = row.split()
        if not atoms:
            # blank row (e.g. trailing newline): nothing to unpack
            continue

        a, b = atoms[:2]
        a, b = gene_name(a), gene_name(b)
        if a not in ranks or b not in ranks:
            continue
        chr_a, rank_a = ranks[a]
        chr_b, rank_b = ranks[b]
        if chr_a == chr_b and abs(rank_a - rank_b) <= Tandem_Nmax:
            tandem.join(a, b)

    tandem_removed = set()  # the filtered gene set
    tandem_map = {}  # unfiltered => filtered
    for tandem_group in tandem:
        # NOTE: when no gene of the group has a positive size in `sizes`,
        # the representative stays the empty string.
        longest_gene, longest_size = "", 0
        for gene in tandem_group:
            gene_size = sizes.get(gene)
            if gene_size is not None and gene_size > longest_size:
                longest_gene, longest_size = gene, gene_size
        for gene in tandem_group:
            tandem_map[gene] = longest_gene
        tandem_removed.add(longest_gene)

    sys.stderr.write(
        "%d genes after tandem removal\n" % len(tandem_removed))

    return tandem_map, tandem_removed
Ejemplo n.º 4
0
def process_tandems(fp_blast, sizes, ranks, tandem):
    """Build tandem groups from a self-BLAST file and choose a gene for each.

    fp_blast -- seekable, iterable file object holding the self-BLAST rows;
                the first two whitespace-separated fields are the identifiers
    sizes    -- dict, gene => size
    ranks    -- dict, gene => pair that unpacks as (chromosome, rank)
    tandem   -- grouping object offering ``join(a, b)`` and iteration over its
                groups; it is updated in place

    A pair of genes is joined when both share a chromosome and their ranks
    are no more than the module-level ``Tandem_Nmax`` apart.

    Returns ``(tandem_map, tandem_removed)``: ``tandem_map`` maps each grouped
    gene to the largest gene in its group and ``tandem_removed`` is the set
    of those chosen genes.  Progress messages are written to stderr.
    """
    # counting pass, needed only for the message below
    fp_blast.seek(0)
    total_lines = sum(1 for row in fp_blast)
    sys.stderr.write("Read self BLAST file (total %d lines)\n" % total_lines)

    fp_blast.seek(0)
    j = 0
    for row in fp_blast:
        j += 1
        if j % 100000 == 0:
            sys.stderr.write("%d read...\n" % j)

        atoms = row.split()
        if not atoms:
            # skip blank rows instead of failing on the two-value unpack
            continue

        a, b = atoms[:2]
        a, b = gene_name(a), gene_name(b)
        if a not in ranks or b not in ranks:
            continue
        chr_a, rank_a = ranks[a]
        chr_b, rank_b = ranks[b]
        if chr_a == chr_b and abs(rank_a - rank_b) <= Tandem_Nmax:
            tandem.join(a, b)

    tandem_removed = set()  # the filtered gene set
    tandem_map = {}  # unfiltered => filtered
    for tandem_group in tandem:
        # NOTE: a group in which no gene has a positive size in `sizes` ends
        # up represented by the empty string.
        longest_gene, longest_size = "", 0
        for gene in tandem_group:
            if gene not in sizes:
                continue
            gene_size = sizes[gene]
            if gene_size > longest_size:
                longest_gene, longest_size = gene, gene_size
        for gene in tandem_group:
            tandem_map[gene] = longest_gene
        tandem_removed.add(longest_gene)

    sys.stderr.write(
        "%d genes after tandem removal\n" % len(tandem_removed))

    return tandem_map, tandem_removed
Ejemplo n.º 5
0
def load_sizes(fp_sizes):
    """Load the gene size info (for keeping longest gene in a tandem group).

    fp_sizes -- seekable, iterable file object; every non-blank row must hold
                exactly two whitespace-separated fields, an identifier and an
                integer size

    Returns a dict, gene => size, where the gene is ``gene_name(identifier)``
    and the size is the largest one seen for that gene.  A row with a
    different number of fields, or a non-integer size, raises ValueError.
    """
    fp_sizes.seek(0)
    sizes = {}  # gene => size
    sys.stderr.write("Read .sizes file\n")
    for row in fp_sizes:
        atoms = row.split()
        if not atoms:
            # blank row (e.g. trailing newline): nothing to unpack
            continue
        gene, size = atoms
        # sizes are calculated for the transcripts, and we keep longest
        # transcript
        gene, size = gene_name(gene), int(size)
        known = sizes.get(gene)
        if known is None or size > known:
            sizes[gene] = size
    return sizes
Ejemplo n.º 6
0
def load_sizes(fp_sizes):
    """Read the .sizes file into a gene => size dict.

    The sizes are used to keep the longest gene of a tandem group.

    fp_sizes -- seekable, iterable file object; each non-blank row must have
                exactly two whitespace-separated fields: an identifier and an
                integer size

    Returns a dict keyed by ``gene_name(identifier)`` holding the largest size
    seen for each gene.  Rows with the wrong number of fields, or with a size
    that is not an integer, raise ValueError.
    """
    fp_sizes.seek(0)
    sizes = {}  # gene => size
    sys.stderr.write("Read .sizes file\n")
    for row in fp_sizes:
        atoms = row.split()
        if not atoms:
            # skip blank rows instead of failing on the two-value unpack
            continue
        gene, size = atoms
        # sizes are calculated for the transcripts, and we keep longest
        # transcript
        gene, size = gene_name(gene), int(size)
        if gene not in sizes or size > sizes[gene]:
            sizes[gene] = size
    return sizes