Ejemplo n.º 1
0
 def testMultiple(self):
     """test truncating several intervals at once."""
     # each case: (intervals, intervals to truncate by, expected remainder)
     cases = (
         ([(0, 5), (10, 15)], [(0, 5)], [(10, 15)]),
         ([(0, 5), (10, 15)], [(0, 10)], [(10, 15)]),
         ([(0, 5), (10, 15)], [(0, 15)], []),
         ([(0, 5), (5, 10)], [(0, 10)], []),
         ([(0, 5), (5, 10)], [], [(0, 5), (5, 10)]),
     )
     for intervals, mask, expected in cases:
         self.assertEqual(Intervals.truncate(intervals, mask), expected)
Ejemplo n.º 2
0
 def testNoOverlap(self):
     """test that truncating by non-overlapping intervals changes nothing."""
     # each case: (intervals, intervals to truncate by); the expected
     # result is always the unchanged first argument.
     cases = (
         ([(0, 5), (10, 15)], [(5, 10)]),
         ([(5, 10)], [(0, 5), (10, 15)]),
         ([(0, 5), (5, 10)], [(10, 15)]),
     )
     for intervals, mask in cases:
         expected = list(intervals)
         self.assertEqual(Intervals.truncate(intervals, mask), expected)
Ejemplo n.º 3
0
 def testMultiple(self):
     """test intersecting several intervals with a single interval."""
     # each case: (first set, second set, expected intersection)
     cases = (
         ([(0, 5), (10, 15)], [(0, 5)], [(0, 5)]),
         ([(0, 5), (10, 15)], [(0, 10)], [(0, 5)]),
         ([(0, 5), (10, 15)], [(0, 15)], [(0, 5), (10, 15)]),
         ([(0, 5), (5, 10)], [(0, 10)], [(0, 5), (5, 10)]),
     )
     for first, second, expected in cases:
         self.assertEqual(Intervals.intersect(first, second), expected)
Ejemplo n.º 4
0
 def testSingle(self):
     """test truncating a single interval by a single interval."""
     # each case: (interval, interval to truncate by, expected remainder)
     # NOTE(review): the (5,10) / (0,10) case appears twice, as in the
     # original test; one of them may have been meant to differ.
     cases = (
         ((0, 5), (0, 5), []),
         ((0, 5), (0, 3), [(3, 5)]),
         ((0, 3), (0, 5), []),
         ((0, 5), (3, 5), [(0, 3)]),
         ((3, 5), (0, 5), []),
         ((5, 10), (5, 10), []),
         ((5, 10), (5, 20), []),
         ((5, 10), (0, 10), []),
         ((5, 10), (0, 10), []),
         ((5, 10), (0, 20), []),
     )
     for interval, mask, expected in cases:
         self.assertEqual(Intervals.truncate([interval], [mask]), expected)
Ejemplo n.º 5
0
 def testSingle(self):
     """test truncating one interval by one other interval."""
     # maps (interval, truncating interval) pairs, in order, to the
     # expected remainder.
     # NOTE(review): the (5, 10) / (0, 10) pair is listed twice, exactly
     # as in the original test.
     pairs = [
         ((0, 5), (0, 5)),
         ((0, 5), (0, 3)),
         ((0, 3), (0, 5)),
         ((0, 5), (3, 5)),
         ((3, 5), (0, 5)),
         ((5, 10), (5, 10)),
         ((5, 10), (5, 20)),
         ((5, 10), (0, 10)),
         ((5, 10), (0, 10)),
         ((5, 10), (0, 20)),
     ]
     remainders = [[], [(3, 5)], [], [(0, 3)], [], [], [], [], [], []]
     for (interval, mask), expected in zip(pairs, remainders):
         self.assertEqual(Intervals.truncate([interval], [mask]), expected)
Ejemplo n.º 6
0
Archivo: GTF.py Proyecto: yangjl/cgat
def iterator_min_feature_length(gff_iterator, min_length, feature="exon"):
    """yield the groups whose `feature` entries are long enough.

    For every group from `gff_iterator` the (start, end) pairs of the
    entries with a matching ``feature`` are passed through
    ``Intervals.combine``; the group is yielded when the summed length
    of the combined intervals is at least `min_length`.
    """
    for gffs in gff_iterator:
        selected = [(entry.start, entry.end)
                    for entry in gffs if entry.feature == feature]
        combined = Intervals.combine(selected)
        total = 0
        for interval in combined:
            total += interval[1] - interval[0]
        if total < min_length:
            continue
        yield gffs
Ejemplo n.º 7
0
Archivo: GTF.py Proyecto: yangjl/cgat
def toSequence(chunk, fasta):
    """build a single sequence string from a list of gff entries.

    All entries must share one contig and one strand (checked with
    asserts). Their intervals are combined with ``Intervals.combine``,
    each resulting piece is fetched with ``fasta.getSequence`` and the
    pieces are concatenated. On the negative strand the coordinates are
    mirrored against the contig length and the pieces are taken in
    reverse order. An empty chunk gives an empty string.
    """
    if len(chunk) == 0:
        return ""

    first = chunk[0]
    contig, strand = first.contig, first.strand

    for entry in chunk:
        assert entry.strand == strand, "features on different strands."
        assert entry.contig == contig, "features on different contigs."

    merged = Intervals.combine([(entry.start, entry.end) for entry in chunk])
    contig_length = fasta.getLength(contig)

    if Genomics.IsPositiveStrand(strand):
        regions = merged
    else:
        # mirror the coordinates and walk the pieces back to front
        regions = [(contig_length - end, contig_length - start)
                   for start, end in merged][::-1]

    pieces = []
    for start, end in regions:
        pieces.append(fasta.getSequence(contig, strand, start, end))

    return "".join(pieces)
Ejemplo n.º 8
0
def toSequence( chunk, fasta ):
    """concatenate the sequence covered by a list of gff entries.

    The entries are asserted to lie on the same contig and strand.
    Their intervals are combined via ``Intervals.combine`` and the
    sequence of each combined interval is fetched from `fasta`. For a
    negative strand the intervals are converted to coordinates counted
    from the other end of the contig and processed in reverse order.
    Returns "" for an empty chunk.
    """
    if len(chunk) == 0:
        return ""

    head = chunk[0]
    contig = head.contig
    strand = head.strand

    for feature in chunk:
        assert feature.strand == strand, "features on different strands."
        assert feature.contig == contig, "features on different contigs."

    spans = Intervals.combine( [ (feature.start, feature.end) for feature in chunk ] )
    lcontig = fasta.getLength(contig)
    on_forward = Genomics.IsPositiveStrand( strand )

    if not on_forward:
        flipped = []
        for start, end in spans:
            flipped.append( (lcontig - end, lcontig - start) )
        flipped.reverse()
        spans = flipped

    return "".join( [ fasta.getSequence( contig, strand, start, end )
                      for start, end in spans ] )
Ejemplo n.º 9
0
def iterator_min_feature_length( gff_iterator, min_length, feature="exon" ):
    """pass through only groups with enough combined `feature` length.

    The (start, end) pairs of all entries of a group whose ``feature``
    equals `feature` are combined with ``Intervals.combine``; a group is
    yielded if the lengths of the combined intervals add up to
    `min_length` or more.
    """
    for gffs in gff_iterator:
        matching = []
        for entry in gffs:
            if entry.feature == feature:
                matching.append( (entry.start, entry.end) )
        combined = Intervals.combine( matching )
        covered = sum( [ span[1] - span[0] for span in combined ] )
        if covered >= min_length:
            yield gffs
Ejemplo n.º 10
0
def toIntronIntervals( chunk ):
    '''convert a set of gtf elements within a transcript to intron coordinates.

    The (start, end) pairs of all elements are combined with
    ``Intervals.combine`` and the result of ``Intervals.complement``
    on the combined intervals is returned. An empty chunk returns an
    empty list.

    Will raise an AssertionError if the elements lie on different
    strands or contigs, or if more than one transcript is submitted.
    (These checks are ``assert`` statements and are therefore skipped
    when Python runs with ``-O``.)

    Note that coordinates will still be forward strand coordinates
    '''
    if len(chunk) == 0: return []

    # the first element is the reference all other elements must agree with
    contig, strand, transcript_id = chunk[0].contig, chunk[0].strand, chunk[0].transcript_id
    for gff in chunk:
        assert gff.strand == strand, "features on different strands."
        assert gff.contig == contig, "features on different contigs."
        assert gff.transcript_id == transcript_id, "more than one transcript submitted"

    intervals = Intervals.combine( [ (x.start, x.end) for x in chunk ] )
    return Intervals.complement( intervals )
Ejemplo n.º 11
0
Archivo: GTF.py Proyecto: yangjl/cgat
def toIntronIntervals(chunk):
    '''convert a set of gtf elements within a transcript to intron coordinates.

    The (start, end) pairs of all elements are combined with
    ``Intervals.combine`` and the result of ``Intervals.complement``
    on the combined intervals is returned. An empty chunk returns an
    empty list.

    Will raise an AssertionError if the elements lie on different
    strands or contigs, or if more than one transcript is submitted.
    (These checks are ``assert`` statements and are therefore skipped
    when Python runs with ``-O``.)

    Note that coordinates will still be forward strand coordinates
    '''
    if len(chunk) == 0:
        return []

    # the first element is the reference all other elements must agree with
    first = chunk[0]
    contig, strand, transcript_id = (first.contig, first.strand,
                                     first.transcript_id)
    for gff in chunk:
        assert gff.strand == strand, "features on different strands."
        assert gff.contig == contig, "features on different contigs."
        assert gff.transcript_id == transcript_id, "more than one transcript submitted"

    intervals = Intervals.combine([(x.start, x.end) for x in chunk])
    return Intervals.complement(intervals)
Ejemplo n.º 12
0
def asRanges( gffs, feature = None ):
    """return ranges within a set of gffs.

    `feature` selects the entries to use:

    * a string: only entries whose ``feature`` equals it,
    * any other true value: only entries whose ``feature`` is
      contained in it (``in`` test),
    * None or another false value: all entries.

    The (start, end) pairs of the selected entries are passed to
    ``Intervals.combine`` and its result is returned.

    Overlapping intervals are merged. 

    The returned intervals are sorted.
    """

    # isinstance() instead of comparing against types.StringType, which
    # only exists in Python 2.
    if isinstance( feature, str ):
        gg = [ x for x in gffs if x.feature == feature ]
    elif feature:
        gg = [ x for x in gffs if x.feature in feature ]
    else:
        gg = gffs[:]
    
    r = [ (g.start, g.end) for g in gg ]
    return Intervals.combine( r )
Ejemplo n.º 13
0
Archivo: GTF.py Proyecto: yangjl/cgat
def asRanges(gffs, feature=None):
    """return ranges within a set of gffs.

    `feature` selects the entries to use:

    * a string: only entries whose ``feature`` equals it,
    * any other true value: only entries whose ``feature`` is
      contained in it (``in`` test),
    * None or another false value: all entries.

    The (start, end) pairs of the selected entries are passed to
    ``Intervals.combine`` and its result is returned.

    Overlapping intervals are merged. 

    The returned intervals are sorted.
    """

    # isinstance() instead of comparing against types.StringType, which
    # only exists in Python 2.
    if isinstance(feature, str):
        gg = [x for x in gffs if x.feature == feature]
    elif feature:
        gg = [x for x in gffs if x.feature in feature]
    else:
        gg = gffs[:]

    r = [(g.start, g.end) for g in gg]
    return Intervals.combine(r)
Ejemplo n.º 14
0
    def processFamily( family_id, family_intervals, all_intervals, min_length_domain, query_length ):
        """append the accepted regions of one family to `all_intervals`.

        `family_intervals` is a list of match objects with ``mStart``,
        ``mEnd`` and ``mPid`` attributes. Their (start, end) regions
        are merged with ``Intervals.combine`` when
        ``options.combine_overlaps`` is set. A region is skipped when
        it is both shorter than `min_length_domain` and covers less
        than ``options.min_coverage`` percent of `query_length`. Every
        other region is appended to `all_intervals` as a
        ``DomainMatch``. Nothing is returned.
        """

        if not family_intervals: return

        # Always work on plain (start, end) tuples. Without this the
        # non-combining branch would iterate over match objects, which
        # cannot be unpacked into start and end.
        i = [ (x.mStart, x.mEnd) for x in family_intervals ]
        if options.combine_overlaps:
            i = Intervals.combine( i )

        ## note: this is overall pid, not per region.
        best_pid = max( [ x.mPid for x in family_intervals ] )
        for start, end in i:
            coverage = 100.0 * (end - start) / query_length
            if end - start < min_length_domain and coverage < options.min_coverage:
                if options.loglevel >= 3:
                    options.stdlog.write("# ignoring domain because too small: %s:%i-%i = cov=%5.2f\n" % (family_id, start, end, coverage))
                continue

            all_intervals.append( DomainMatch( best_pid, start, end, family_id ) )
Ejemplo n.º 15
0
def main():
    """Run the mapping pipeline from start to finish.

    Checks that ``r`` and ``Rscript`` can be found with ``which``,
    reads the run parameters from ``get_parameters()``, then calls in
    order ``TreesCriteria_counts.count``, ``Intervals.mapIntervals``,
    ``MapInfoHelper.redistributeLoci`` and
    ``BuildMapMatrix.BuildMatrix``, printing a summary after each of
    the first three. Finally removes ``*.pyc`` files from the current
    directory.
    """

    # os.system returns the status of the shell command; 256 is treated
    # as "not found" here.
    # NOTE(review): presumably this is exit code 1 of `which` encoded
    # as a POSIX wait status -- confirm.
    # NOTE(review): the lookup uses lower-case "r"; confirm that this
    # is the intended executable name.
    rExists = os.system("which r")
    if rExists == 256:
        print "\n\nYou need to install r first\n\n"
        quit()

    rscriptExists = os.system("which Rscript")
    if rscriptExists == 256:
        print "\n\nYou need to install Rscript first\n\n"
        quit()

    path2files, treesFolder, chromoSizeFile, mappingFile, majorClade, minorsXmajor, majors, criterion, m_interval = get_parameters(
    )

    # Counting minor clades and filtering by criterion
    result_counts = TreesCriteria_counts.count(path2files, treesFolder,
                                               majorClade, mappingFile,
                                               int(criterion))

    # result_counts is a comma-separated string: field 0 is reported as
    # the gene total, field 1 as the tree total.
    print "total genes = " + result_counts.split(",")[0]
    print "total trees = " + result_counts.split(",")[1]

    # Mapping the information
    result_mapIntervals = Intervals.mapIntervals(path2files, m_interval,
                                                 chromoSizeFile)

    print "number of chromosomes: " + str(
        result_mapIntervals['number of chromosomes'])
    print "map size: " + str(result_mapIntervals['map size'])
    print "genes mapped: " + str(result_mapIntervals['genes mapped'])

    # Redistributing the loci that are not clearly in an interval.

    result_mapInfoHelper = MapInfoHelper.redistributeLoci(path2files)
    print result_mapInfoHelper

    # The return value is not used afterwards.
    result_matrix = BuildMapMatrix.BuildMatrix(path2files, minorsXmajor,
                                               majors, m_interval)

    # Remove compiled Python files from the current working directory.
    os.system("rm *.pyc")
Ejemplo n.º 16
0
    def processFamily(family_id, family_intervals, all_intervals,
                      min_length_domain, query_length):
        """append the accepted regions of one family to `all_intervals`.

        `family_intervals` is a list of match objects with ``mStart``,
        ``mEnd`` and ``mPid`` attributes. Their (start, end) regions
        are merged with ``Intervals.combine`` when
        ``options.combine_overlaps`` is set. A region is skipped when
        it is both shorter than `min_length_domain` and covers less
        than ``options.min_coverage`` percent of `query_length`. Every
        other region is appended to `all_intervals` as a
        ``DomainMatch``. Nothing is returned.
        """

        if not family_intervals:
            return

        # Always work on plain (start, end) tuples. Without this the
        # non-combining branch would iterate over match objects, which
        # cannot be unpacked into start and end.
        i = [(x.mStart, x.mEnd) for x in family_intervals]
        if options.combine_overlaps:
            i = Intervals.combine(i)

        ## note: this is overall pid, not per region.
        best_pid = max([x.mPid for x in family_intervals])
        for start, end in i:
            coverage = 100.0 * (end - start) / query_length
            if end - start < min_length_domain and coverage < options.min_coverage:
                if options.loglevel >= 3:
                    options.stdlog.write(
                        "# ignoring domain because too small: %s:%i-%i = cov=%5.2f\n"
                        % (family_id, start, end, coverage))
                continue

            all_intervals.append(DomainMatch(best_pid, start, end, family_id))
Ejemplo n.º 17
0
 def testArray2(self):
     """test an array with three runs of set values."""
     flags = [1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1]
     runs = Intervals.fromArray(flags)
     self.assertEqual(runs, [(0, 3), (6, 9), (12, 15)])
     # the negated array must give the gaps between the runs
     gaps = Intervals.fromArray([not flag for flag in flags])
     self.assertEqual(gaps, [(3, 6), (9, 12)])
Ejemplo n.º 18
0
 def testArray1(self):
     """test an array with two runs of set values."""
     flags = [1, 1, 1, 0, 0, 0, 1, 1, 1]
     runs = Intervals.fromArray(flags)
     self.assertEqual(runs, [(0, 3), (6, 9)])
     # the negated array must give the single gap between the runs
     gaps = Intervals.fromArray([not flag for flag in flags])
     self.assertEqual(gaps, [(3, 6)])
Ejemplo n.º 19
0
 def testEmpty(self):
     """test that an empty array gives no intervals."""
     result = Intervals.fromArray([])
     self.assertEqual(result, [])
Ejemplo n.º 20
0
    def Finalize( self ):
        """process each sequence and fine tune domain boundaries.
        adds singletons as well.

        For every nid the domain boundaries are grouped by family,
        optionally merged per family (``self.mCombineOverlaps``) and
        written with ``self.WriteIntervals``. The regions returned by
        ``Intervals.complement`` for the collected intervals that are
        longer than ``self.mMinSingletonLength`` are written as new
        singleton families. Afterwards the output files are closed and
        ``self.Load()`` is called.
        """

        nids = self.mTableNids.GetAllNids()        

        # summary of the table state before processing
        if self.mLogLevel >= 1:
            print "--> at the beginning: "
            print "--> domains: %i" % (self.mTableFamilies.RowCount())
            print "--> alignments: %i" % (self.mTableDomains.RowCount())
            print "--> nids: %i" % len(self.mTableDomains.GetAllNids())
            print "--> in %s: %i" % (self.mTableNameSource, len(nids))
            sys.stdout.flush()

        self.OpenOutfiles()
        nsingletons = 0

        # families already in use; extended below with every new
        # singleton family and passed to GetNewFamily.
        known_families = set(self.mTableFamilies.GetAllClasses())
        
        for nid in nids:

            if self.mFilterRepeats:
                repeats = self.mTableDomainsCore.GetDomainBoundaries( nid )
            else:
                repeats = None
            
            domains = list(self.mTableDomains.GetDomainBoundaries(nid))
            length = self.mTableNrdb.GetLength( nid )

            # sorting brings all entries of one family next to each
            # other (family is the first tuple field), which the
            # change-of-family detection below relies on.
            domains.sort()
            all_intervalls = []
            last_family = None
            for family, domain_from, domain_to in domains:

                if last_family != family:
                    # a new family starts: flush the intervals collected
                    # for the previous family (if there was one)
                    if last_family:
                        if self.mCombineOverlaps:
                            i = Intervals.combine( family_intervalls )
                        else:
                            i = family_intervalls
                            
                        all_intervalls += i
                        self.WriteIntervals( last_family, nid, i, repeats)

                    family_intervalls = []
                    
                last_family = family
                family_intervalls.append( (domain_from, domain_to) )

            # flush the last family of this nid
            if last_family:

                if self.mCombineOverlaps:
                    i = Intervals.combine( family_intervalls )
                else:
                    i = family_intervalls
                    
                all_intervalls += i                
                self.WriteIntervals( last_family, nid, i, repeats)

            # remove all domains that overlap with repeats by adding the repeats
            if self.mFilterRepeats:

                for rfamily, rfrom, rto in repeats:
                    all_intervalls.append( (rfrom, rto) )
                    
            # add singletons
            # presumably the regions between 1 and length that are not
            # covered by all_intervalls -- confirm against
            # Intervals.complement.
            i = Intervals.complement( all_intervalls, 1, length)

            if self.mLogLevel > 3:
                print "nid=%i" % nid, all_intervalls, repeats, domains, i

            for first_res, last_res in i:
                # only regions longer than the minimum become singletons
                if last_res-first_res > self.mMinSingletonLength:
                    
                    new_family = self.mTableFamilies.GetNewFamily( known_families )
                    self.WriteNewSingleton( new_family, nid, first_res, last_res )
                    nsingletons += 1
                    known_families.add( new_family )
            
        self.CloseOutfiles()

        self.Load()

        # summary of the table state after processing
        if self.mLogLevel >= 1:
            print "--> at the end: "
            print "--> domains: %i" % (self.mTableFamilies.RowCount())
            print "--> alignments: %i" % (self.mTableDomains.RowCount())
            print "--> nids: %i" % len(self.mTableDomains.GetAllNids())
            print "--> singletons added: %i" % nsingletons
            sys.stdout.flush()
Ejemplo n.º 21
0
def buildSCOPDomains( infiles, outfile ):
    '''reconcile mapped domains into a single domain file.

    * fragments are removed - a domain must map at least 90%
      of its length.

    * domains overlapping on the same sequence with the same
      superfamily classification are merged.

    `infiles` is a pair ``(linksfile, fastafile)``. The fasta titles
    are parsed as ``<id> <classification> <description>`` and supply
    the classification and length of each domain. Each data line of
    the links file is split into twelve whitespace-separated fields.
    Only domains whose class is one of ``abcd`` are kept.

    `outfile` receives a tab-separated table with the columns
    nid, start, end and family.
    '''

    linksfile, fastafile = infiles

    # filtering criteria
    min_coverage = 0.9
    # only take first four fold classes
    classes = 'abcd'

    rx = re.compile( r'(\S+)\s(\S+)\s(.*)' )
    id2class = {}
    with IOTools.openFile( fastafile ) as inf:
        for x in FastaIterator.iterate( inf ):
            pid, cls, description = rx.match(x.title).groups()
            id2class[pid] = (cls, len(x.sequence) )

    E.info('read mappings for %i sequences' % len(id2class))
    counter = E.Counter()

    nid2domains = collections.defaultdict( list )
    with IOTools.openFile( linksfile ) as inf:
        for line in inf:
            # skip the header line and comments
            if line.startswith('query_nid'): continue
            if line.startswith('#'): continue
            counter.links += 1

            (domain_id, nid, evalue, domain_start, domain_end,
             sbjct_start, sbjct_end, block_sizes, domain_starts,
             sbjct_starts, bitscore, pid) = line[:-1].split()

            nid, domain_start, domain_end, sbjct_start, sbjct_end = map(
                int, ( nid, domain_start, domain_end, sbjct_start, sbjct_end ) )

            classification, length = id2class[domain_id]

            cls, fold, superfamily, family = classification.split('.')
            if cls not in classes: continue
            # drop fragments: the mapped part must cover enough of the domain
            if float(domain_end - domain_start) / length < min_coverage: continue
            counter.unmerged_domains += 1
            superfamily = '00%c%03i%03i' % (cls, int(fold), int(superfamily))

            nid2domains[nid].append( (superfamily, sbjct_start, sbjct_end ) )

        counter.sequences = len(nid2domains)

    E.info( 'merging %i domains in %i sequences' % (counter.unmerged_domains, counter.sequences))

    # the with-block closes the output file even if writing fails
    with IOTools.openFile( outfile, 'w' ) as outf:
        outf.write('nid\tstart\tend\tfamily\n')
        for nid, dd in sorted(nid2domains.items()):
            # groupby only groups adjacent items, so the domains have to
            # be sorted by superfamily first; otherwise domains of the
            # same superfamily that are not adjacent in the input would
            # end up in separate groups and never be merged.
            for family, domains in itertools.groupby( sorted(dd), key = lambda x: x[0] ):
                unmerged_domains = [ (x[1],x[2]) for x in domains ]
                merged_domains = Intervals.combine( unmerged_domains )
                for start, end in merged_domains:
                    counter.domains += 1
                    outf.write( '%i\t%i\t%i\t%s\n' % (nid, start, end, family ) )

    E.info( counter )
Ejemplo n.º 22
0
    def processChunk( query_id, matches ):
        """process all matches of one query and write its mapped domains.

        Matches below ``options.threshold_min_pid`` are discarded, the
        rest are classified by query coverage. If any match remains,
        the combined coverage of the query is recorded, the domains are
        mapped with ``mapDomains`` and the mapped domains and
        singletons are written to ``options.stdout`` as tab-separated
        lines (query_id, start, end, family). The global counters are
        updated along the way.
        """
        global ninput, noutput, nskipped
        global nfull_matches, npartial_matches, ngood_matches
        global nremoved_pid 
        global new_family_id, nsingletons, nmapped_domains, nmapped_sequences, nmapped_empty

        ninput += 1

        full_matches = []
        good_matches = []
        partial_matches = []
        for match in matches:

            if match.mPid < options.threshold_min_pid:
                nremoved_pid += 1
                continue

            ## check for full length matches
            query_coverage = 100.0 * (match.mQueryTo - match.mQueryFrom) / match.mQueryLength

            # NOTE(review): the second test is `if`, not `elif`, so a
            # full match is additionally added to good_matches or
            # partial_matches -- confirm that this is intended.
            if query_coverage >= 99.9:
                full_matches.append(match)
            if query_coverage > options.threshold_min_query_coverage:
                good_matches.append(match)
            else:
                partial_matches.append(match)

        # count the query once, under its best category
        if full_matches:
            nfull_matches += 1
        elif good_matches:
            ngood_matches += 1
        elif partial_matches:
            npartial_matches += 1
        else:
            # no match passed the pid filter
            nskipped += 1
            return
            
        ## compute coverage of sequence with matches
        intervals = []
        for match in full_matches + good_matches + partial_matches:
            intervals.append( (match.mQueryFrom, match.mQueryTo) )
        
        # `match` is the last match of the loop above; its mQueryLength
        # is used as the query length from here on.
        rest = Intervals.complement( intervals, 0, match.mQueryLength )
        
        query_coverage = 100.0 * (match.mQueryLength - sum( map( lambda x: x[1] - x[0], rest) ) ) / match.mQueryLength

        if query_coverage >= 99.9:
            fully_matched.append( query_id )
        elif  query_coverage > options.threshold_min_query_coverage:
            well_matched.append( query_id )
        else:
            partially_matched.append( query_id )

        aggregate_coverages.append( query_coverage )
        
        # note: all matches are passed on, including those removed by
        # the pid filter above.
        new_family_id, mapped_domains, singletons = mapDomains( query_id, matches, map_nid2domains, new_family_id, options )
        
        if len(mapped_domains) > 0:
            nmapped_sequences += 1
        else:
            nmapped_empty += 1
        nmapped_domains += len(mapped_domains)

        # sum of the mapped domain lengths as a percentage of the query length
        mapped_coverage = 100.0 * sum( map( lambda x: x.mEnd - x.mStart, mapped_domains ) ) / match.mQueryLength
        mapped_coverages.append( mapped_coverage )
        
        for domain in mapped_domains:
            options.stdout.write( "\t".join( map( str, (query_id, domain.mStart, domain.mEnd, domain.mFamily) ) ) + "\n" )

        for domain in singletons:
            options.stdout.write( "\t".join( map( str, (query_id, domain.mStart, domain.mEnd, domain.mFamily) ) ) + "\n" )

        noutput += 1
Ejemplo n.º 23
0
 def testArray2(self):
     """test an array with three runs of set values."""
     values = [1,1,1,0,0,0,1,1,1,0,0,0,1,1,1]
     expected_runs = [(0,3), (6,9), (12,15)]
     self.assertEqual( Intervals.fromArray( values ), expected_runs )
     # the negated array must give the gaps between the runs
     expected_gaps = [(3,6), (9,12)]
     negated = [not value for value in values]
     self.assertEqual( Intervals.fromArray( negated ), expected_gaps )
Ejemplo n.º 24
0
 def testEmpty(self):
     """test that an empty array gives no intervals."""
     intervals = Intervals.fromArray( [] )
     self.assertEqual( intervals, [] )
Ejemplo n.º 25
0
 def testNoOverlap(self):
     """test that intervals that only touch have an empty intersection."""
     # each case: (first set, second set)
     cases = (
         ( [(0,5), (10,15)], [(5,10)] ),
         ( [(5,10)], [(0,5), (10,15)] ),
     )
     for first, second in cases:
         self.assertEqual( Intervals.intersect( first, second ), [] )
Ejemplo n.º 26
0
 def testHalfEmpty(self):
     """test that intersecting with an empty set gives an empty set."""
     # the empty set is tried on either side
     for first, second in ( ([(0,5)], []), ([], [(0,5)]) ):
         self.assertEqual( Intervals.intersect( first, second ), [] )
Ejemplo n.º 27
0
    nc.connect((server_ip, 8080))
    nc.send(b"hash")
    newconfighash = nc.recv(64).decode()
    if not confighash == newconfighash:
        nc.send(b"config")
        newconfig = nc.recv(1024).decode()
        nc.close()
        print(newconfighash, newconfig)
        configfile = open("config_client.json", 'w')
        configfile.write(newconfig)
        configfile.close()
    else:
        nc.send(b"END")
        nc.close()


# Write this process's PID into a pipe; the read end is handed to the
# child process below as its stdin.
read, write = os.pipe()
os.write(write, str(os.getpid()).encode())
os.close(write)

# Register `checker` under id 0.
# NOTE(review): presumably 1000 is the repeat period in milliseconds --
# confirm against Intervals.setinterval.
Intervals.setinterval(checker, 1000, 0)
# The command is passed as an argument list: without shell=True a plain
# string is treated as the name of a single program on POSIX, so
# "python main.py" would not be found there.
stream = subprocess.Popen(["python", "main.py"], stdin=read)
time.sleep(360)
# NOTE(review): the credentials are hard-coded here; consider reading
# them from the configuration or the environment instead.
requests.request("GET",
                 "http://localhost:9000/sh?pid=" + str(os.getpid()),
                 auth=HTTPBasicAuth('user', 'Simple_pass'))
print("Killed")
time.sleep(10)
# at the end of program
Intervals.delinterval(0)
Ejemplo n.º 28
0
 def testHalfEmpty(self):
     """test that intersecting with an empty set gives an empty set."""
     filled = [(0, 5)]
     # the empty set is tried as second and as first argument
     right_empty = Intervals.intersect(filled, [])
     self.assertEqual(right_empty, [])
     left_empty = Intervals.intersect([], [(0, 5)])
     self.assertEqual(left_empty, [])
Ejemplo n.º 29
0
    def processChunk(query_id, matches):
        """process all matches of one query and write its mapped domains.

        Matches below ``options.threshold_min_pid`` are discarded, the
        rest are classified by query coverage. If any match remains,
        the combined coverage of the query is recorded, the domains are
        mapped with ``mapDomains`` and the mapped domains and
        singletons are written to ``options.stdout`` as tab-separated
        lines (query_id, start, end, family). The global counters are
        updated along the way.
        """
        global ninput, noutput, nskipped
        global nfull_matches, npartial_matches, ngood_matches
        global nremoved_pid
        global new_family_id, nsingletons, nmapped_domains, nmapped_sequences, nmapped_empty

        ninput += 1

        full_matches = []
        good_matches = []
        partial_matches = []
        for match in matches:

            if match.mPid < options.threshold_min_pid:
                nremoved_pid += 1
                continue

            ## check for full length matches
            query_coverage = 100.0 * (match.mQueryTo -
                                      match.mQueryFrom) / match.mQueryLength

            # NOTE(review): the second test is `if`, not `elif`, so a
            # full match is additionally added to good_matches or
            # partial_matches -- confirm that this is intended.
            if query_coverage >= 99.9:
                full_matches.append(match)
            if query_coverage > options.threshold_min_query_coverage:
                good_matches.append(match)
            else:
                partial_matches.append(match)

        # count the query once, under its best category
        if full_matches:
            nfull_matches += 1
        elif good_matches:
            ngood_matches += 1
        elif partial_matches:
            npartial_matches += 1
        else:
            # no match passed the pid filter
            nskipped += 1
            return

        ## compute coverage of sequence with matches
        intervals = []
        for match in full_matches + good_matches + partial_matches:
            intervals.append((match.mQueryFrom, match.mQueryTo))

        # `match` is the last match of the loop above; its mQueryLength
        # is used as the query length from here on.
        rest = Intervals.complement(intervals, 0, match.mQueryLength)

        query_coverage = 100.0 * (match.mQueryLength - sum(
            map(lambda x: x[1] - x[0], rest))) / match.mQueryLength

        if query_coverage >= 99.9:
            fully_matched.append(query_id)
        elif query_coverage > options.threshold_min_query_coverage:
            well_matched.append(query_id)
        else:
            partially_matched.append(query_id)

        aggregate_coverages.append(query_coverage)

        # note: all matches are passed on, including those removed by
        # the pid filter above.
        new_family_id, mapped_domains, singletons = mapDomains(
            query_id, matches, map_nid2domains, new_family_id, options)

        if len(mapped_domains) > 0:
            nmapped_sequences += 1
        else:
            nmapped_empty += 1
        nmapped_domains += len(mapped_domains)

        # sum of the mapped domain lengths as a percentage of the query length
        mapped_coverage = 100.0 * sum(
            map(lambda x: x.mEnd - x.mStart,
                mapped_domains)) / match.mQueryLength
        mapped_coverages.append(mapped_coverage)

        for domain in mapped_domains:
            options.stdout.write("\t".join(
                map(str, (query_id, domain.mStart, domain.mEnd,
                          domain.mFamily))) + "\n")

        for domain in singletons:
            options.stdout.write("\t".join(
                map(str, (query_id, domain.mStart, domain.mEnd,
                          domain.mFamily))) + "\n")

        noutput += 1
Ejemplo n.º 30
0
 def testEmpty(self):
     """test truncating an empty set by an empty set."""
     remainder = Intervals.truncate( [], [] )
     self.assertEqual( remainder, [] )
Ejemplo n.º 31
0
 def testEmpty(self):
     """test truncating an empty set by an empty set."""
     truncated = Intervals.truncate([], [])
     self.assertEqual(truncated, [])
Ejemplo n.º 32
0
def mapDomains( query_id, matches, map_nid2domains, new_family_id, options ):
    """map domains onto query_id.

    For every match whose ``mNid`` has domains in `map_nid2domains`,
    the domain boundaries are mapped onto the query through the
    alignment of the match. The mapped domains are then grouped by
    family (and merged per family if ``options.combine_overlaps`` is
    set), filtered by length/coverage and reduced to a set of
    non-overlapping domains, preferring large values of
    pid * length. If ``options.add_singletons`` is set, the regions of
    the query not covered by any chosen domain and longer than
    ``options.min_length_singletons`` are returned as singletons, each
    with a new family id.

    Returns a tuple ``(new_family_id, domains, singletons)`` where
    `new_family_id` has been advanced by the number of singletons
    created. With no matches at all nothing can be mapped and
    ``(new_family_id, [], [])`` is returned.
    """

    if options.loglevel >= 1:
        options.stdlog.write("# attempting to map domains for %s\n" % query_id )

        if options.loglevel >= 3:
            for match in matches:
                options.stdlog.write("# match=%s\n" % str(match) )
                nid = match.mNid
                if nid in map_nid2domains:
                    for domain in map_nid2domains[nid]:
                        options.stdlog.write("# domain=%s\n" % str(domain) )
                else:
                    options.stdlog.write("# no domains for nid %s\n" % nid )

    # The query length is only known from the matches. Without matches
    # there is nothing to map and no region to build singletons from.
    if not matches:
        return new_family_id, [], []

    mapped_domains = []

    class DomainMatch:
        """a domain region on the query together with a pid and a family."""
        def __init__(self, pid, start, end, family):
            self.mPid = pid
            self.mStart = start
            self.mEnd = end
            self.mFamily = family

        def __str__ (self ):
            return "\t".join(map(str, (self.mPid, self.mStart, self.mEnd, self.mFamily)))

    for match in matches:
        nid = match.mNid
        query_length = match.mQueryLength
        if nid not in map_nid2domains: continue

        match.buildAlignment()

        ali = match.mMapSbjct2Query

        for domain in map_nid2domains[nid]:

            # skip domains that lie outside the aligned rows
            if domain.mStart >= ali.getRowTo() or domain.mEnd < ali.getRowFrom():
                continue

            start = ali.mapRowToCol( domain.mStart, alignlib.RIGHT )
            end   = ali.mapRowToCol( domain.mEnd, alignlib.LEFT )
            assert start >= 0 and end <= query_length, "warning: unmapped coordinates: %i-%i" % (start,end)
            mapped_domains.append( DomainMatch(match.mPid, start, end, domain.mFamily) )

    if options.loglevel >= 1:
        options.stdlog.write( "# nid=%s, length=%i, mapped domains=%i\n" % (query_id, query_length, len(mapped_domains) ) )

    ## sort by matches by family
    ## (a key function instead of a cmp function: same stable order)
    mapped_domains.sort( key = lambda x: x.mFamily )

    ##########################################################
    ##########################################################
    ##########################################################
    ## combine matches from different sources

    def processFamily( family_id, family_intervals, all_intervals, min_length_domain, query_length ):
        """append the accepted regions of one family to all_intervals."""

        if not family_intervals: return

        # Always work on plain (start, end) tuples. Without this the
        # non-combining branch would iterate over DomainMatch objects,
        # which cannot be unpacked into start and end.
        i = [ (x.mStart, x.mEnd) for x in family_intervals ]
        if options.combine_overlaps:
            i = Intervals.combine( i )

        ## note: this is overall pid, not per region.
        best_pid = max( [ x.mPid for x in family_intervals ] )
        for start, end in i:
            coverage = 100.0 * (end - start) / query_length
            # a region is only dropped if it is both short and covers
            # little of the query
            if end - start < min_length_domain and coverage < options.min_coverage:
                if options.loglevel >= 3:
                    options.stdlog.write("# ignoring domain because too small: %s:%i-%i = cov=%5.2f\n" % (family_id, start, end, coverage))
                continue

            all_intervals.append( DomainMatch( best_pid, start, end, family_id ) )

    last_family = None
    family_intervals = []
    all_intervals = []
    min_length_domain = min( options.min_length_domain, query_length - 10 )

    # mapped_domains is sorted by family, so a change of family marks
    # the end of a group
    for domain in mapped_domains:
        if last_family != domain.mFamily:
            processFamily( last_family, family_intervals, all_intervals, min_length_domain, query_length )
            family_intervals = []

        last_family = domain.mFamily
        family_intervals.append( domain )

    processFamily( last_family, family_intervals, all_intervals, min_length_domain, query_length )

    if options.loglevel >= 2:
        options.stdlog.write("# %s: before filtering: %i domains\n" % (query_id, len(all_intervals)))
        for d in all_intervals:
            options.stdlog.write("# %s\n" % str(d))

    ##########################################################
    ##########################################################
    ##########################################################
    ## pick the best domains
    ## (ascending sort followed by reverse(), as before, so that the
    ## order of domains with equal score is unchanged)
    all_intervals.sort( key = lambda x: x.mPid * float(x.mEnd - x.mStart) )
    all_intervals.reverse()

    # greedy selection: take a domain only if it does not overlap any
    # domain that has already been taken
    new_intervals = []
    for domain in all_intervals:

        overlap = Intervals.calculateOverlap( [ (x.mStart, x.mEnd) for x in new_intervals ],
                                              [(domain.mStart,domain.mEnd)] )

        if overlap > 0:
            continue

        new_intervals.append( domain )

    all_intervals = new_intervals

    if options.loglevel >= 2:
        options.stdlog.write("# %s: after filtering: %i domains\n" % (query_id, len(all_intervals)))
        for d in all_intervals:
            options.stdlog.write("# %s\n" % str(d))

    ##########################################################
    ##########################################################
    ##########################################################
    ## add singletons
    singletons = []

    if options.add_singletons:
        all_singletons = Intervals.complement(
            [ (x.mStart, x.mEnd) for x in all_intervals ],
            0, query_length)

        for first_res, last_res in all_singletons:
            if last_res-first_res > options.min_length_singletons:
                # NOTE(review): `Domain` is not defined in this function;
                # presumably it is a class defined elsewhere in the
                # module -- confirm (DomainMatch takes the same arguments).
                singletons.append( Domain( 0, first_res, last_res, new_family_id ) )
                new_family_id += 1

    return new_family_id, all_intervals, singletons
Ejemplo n.º 33
0
 def testArray1(self):
     """test an array with two runs of set values."""
     values = [1,1,1,0,0,0,1,1,1]
     expected_runs = [(0,3), (6,9)]
     self.assertEqual( Intervals.fromArray( values ), expected_runs )
     # the negated array must give the single gap between the runs
     negated = [not value for value in values]
     self.assertEqual( Intervals.fromArray( negated ), [(3,6)] )
Ejemplo n.º 34
0
 def testEmpty(self):
     """test intersecting two empty sets."""
     intersection = Intervals.intersect([], [])
     self.assertEqual(intersection, [])
Ejemplo n.º 35
0
def mapDomains(query_id, matches, map_nid2domains, new_family_id, options):
    """Map domains of matched sequences onto the query ``query_id``.

    Domains registered in ``map_nid2domains`` for each match's ``mNid`` are
    projected onto query coordinates through the match alignment
    (``mMapSbjct2Query``), grouped by family, optionally merged, filtered by
    size/coverage and finally reduced to a set of non-overlapping domains,
    visiting candidates in decreasing order of ``mPid * length``.

    Arguments:
        query_id: identifier of the query; only used in log messages.
        matches: sequence of match objects providing ``mNid``, ``mPid``,
            ``mQueryLength``, ``buildAlignment()`` and ``mMapSbjct2Query``.
        map_nid2domains: mapping of nid to the domains (objects with
            ``mStart``, ``mEnd`` and ``mFamily``) annotated on that nid.
        new_family_id: family id given to the first singleton created; it is
            incremented for every singleton.
        options: settings object; reads ``loglevel``, ``stdlog``,
            ``combine_overlaps``, ``min_coverage``, ``min_length_domain``,
            ``add_singletons`` and ``min_length_singletons``.

    Returns:
        A tuple ``(new_family_id, domains, singletons)`` where ``domains`` is
        the list of selected ``DomainMatch`` objects and ``singletons`` the
        list of ``Domain`` objects created for uncovered query regions (empty
        unless ``options.add_singletons`` is set). If ``matches`` is empty the
        query length is unknown and ``(new_family_id, [], [])`` is returned.
    """

    if options.loglevel >= 1:
        options.stdlog.write("# attempting to map domains for %s\n" % query_id)

        if options.loglevel >= 3:
            for match in matches:
                options.stdlog.write("# match=%s\n" % str(match))
                nid = match.mNid
                if nid in map_nid2domains:
                    for domain in map_nid2domains[nid]:
                        options.stdlog.write("# domain=%s\n" % str(domain))
                else:
                    options.stdlog.write("# no domains for nid %s\n" % nid)

    class DomainMatch:
        """A domain in query coordinates together with the pid of its match."""

        def __init__(self, pid, start, end, family):
            self.mPid = pid
            self.mStart = start
            self.mEnd = end
            self.mFamily = family

        def __str__(self):
            return "\t".join(
                map(str, (self.mPid, self.mStart, self.mEnd, self.mFamily)))

    mapped_domains = []

    # Stays None when there is no match at all. The value kept afterwards is
    # the one of the last match (presumably identical for all matches of one
    # query -- TODO confirm against caller).
    query_length = None

    for match in matches:
        nid = match.mNid
        query_length = match.mQueryLength
        if nid not in map_nid2domains:
            continue

        match.buildAlignment()
        ali = match.mMapSbjct2Query

        for domain in map_nid2domains[nid]:
            # presumably: skip domains outside the aligned row range
            if (domain.mStart >= ali.getRowTo()
                    or domain.mEnd < ali.getRowFrom()):
                continue

            start = ali.mapRowToCol(domain.mStart, alignlib.RIGHT)
            end = ali.mapRowToCol(domain.mEnd, alignlib.LEFT)
            assert start >= 0 and end <= query_length, \
                "warning: unmapped coordinates: %i-%i" % (start, end)
            mapped_domains.append(
                DomainMatch(match.mPid, start, end, domain.mFamily))

    if query_length is None:
        # No matches: previously this fell through and failed with a
        # NameError on the unbound query_length. Without a query length
        # there is nothing to map and no singleton can be derived.
        return new_family_id, [], []

    if options.loglevel >= 1:
        options.stdlog.write("# nid=%s, length=%i, mapped domains=%i\n" %
                             (query_id, query_length, len(mapped_domains)))

    ## sort matches by family so that each family forms one contiguous run
    mapped_domains.sort(key=lambda x: x.mFamily)

    ##########################################################
    ##########################################################
    ##########################################################
    ## combine matches from different sources

    def processFamily(family_id, family_intervals, all_intervals,
                      min_length_domain, query_length):
        """Append the acceptable regions of one family to all_intervals."""

        if not family_intervals:
            return

        # Both branches must yield (start, end) tuples: the loop below
        # unpacks them, which fails on raw DomainMatch objects.
        regions = [(x.mStart, x.mEnd) for x in family_intervals]
        if options.combine_overlaps:
            regions = Intervals.combine(regions)

        ## note: this is overall pid, not per region.
        best_pid = max(x.mPid for x in family_intervals)
        for start, end in regions:
            coverage = 100.0 * (end - start) / query_length
            # a region is dropped only if it is both short and low-coverage
            if end - start < min_length_domain and coverage < options.min_coverage:
                if options.loglevel >= 3:
                    options.stdlog.write(
                        "# ignoring domain because too small: %s:%i-%i = cov=%5.2f\n"
                        % (family_id, start, end, coverage))
                continue

            all_intervals.append(DomainMatch(best_pid, start, end, family_id))

    last_family = None
    family_intervals = []
    all_intervals = []
    min_length_domain = min(options.min_length_domain, query_length - 10)

    for domain in mapped_domains:
        if last_family != domain.mFamily:
            # family changed: flush the run collected so far
            processFamily(last_family, family_intervals, all_intervals,
                          min_length_domain, query_length)
            family_intervals = []

        last_family = domain.mFamily
        family_intervals.append(domain)

    # flush the final run
    processFamily(last_family, family_intervals, all_intervals,
                  min_length_domain, query_length)

    if options.loglevel >= 2:
        options.stdlog.write("# %s: before filtering: %i domains\n" %
                             (query_id, len(all_intervals)))
        for d in all_intervals:
            options.stdlog.write("# %s\n" % str(d))

    ##########################################################
    ##########################################################
    ##########################################################
    ## pick the best domains
    # Ascending stable sort followed by reverse() (rather than reverse=True)
    # keeps the tie order of the former cmp-based sort.
    all_intervals.sort(key=lambda x: x.mPid * float(x.mEnd - x.mStart))
    all_intervals.reverse()

    new_intervals = []
    for domain in all_intervals:

        overlap = Intervals.calculateOverlap(
            [(x.mStart, x.mEnd) for x in new_intervals],
            [(domain.mStart, domain.mEnd)])

        # greedy selection: reject anything overlapping an accepted domain
        if overlap > 0:
            continue

        new_intervals.append(domain)

    all_intervals = new_intervals

    if options.loglevel >= 2:
        options.stdlog.write("# %s: after filtering: %i domains\n" %
                             (query_id, len(all_intervals)))
        for d in all_intervals:
            options.stdlog.write("# %s\n" % str(d))

    ##########################################################
    ##########################################################
    ##########################################################
    ## add singletons
    singletons = []

    if options.add_singletons:
        # regions of [0, query_length) not covered by any selected domain
        all_singletons = Intervals.complement(
            [(x.mStart, x.mEnd) for x in all_intervals], 0, query_length)

        for first_res, last_res in all_singletons:
            if last_res - first_res > options.min_length_singletons:
                singletons.append(Domain(0, first_res, last_res,
                                         new_family_id))
                new_family_id += 1

    return new_family_id, all_intervals, singletons
Ejemplo n.º 36
0
import numpy as np
import Intervals as vl

# Sample data: the integers 1 through 10.
test = np.array(list(range(1, 11)))

intervalClass = vl.Intervals(test)

# Print every labeling prefixed with its running index.
Labelings = intervalClass.allLabelings()
for count, labels in enumerate(Labelings):
    print("%s. %s" % (count, labels))

# The number typed by the user is handed to setBestFunction
# (presumably an index into the list printed above).
choice = int(input("Enter function choice: "))
intervalClass.setBestFunction(choice)
bestInterval = intervalClass.getBestFunction()

# Apply the selected function to every sample value ...
for i in test:
    print(bestInterval(i))

# ... and once more to the value 6, preceded by a newline.
print("\n%s" % (bestInterval(6),))
Ejemplo n.º 37
0
 def testEmpty(self):
     """Two empty interval lists intersect to an empty list."""
     expected = []
     self.assertEqual(Intervals.intersect([], []), expected)