Example #1
#!/usr/bin/env python
"""Remove duplicates in a fasta file"""
import sys
from dp.associations import GeneAssociations
from dp.ontology import Ontology
from collections import Counter
from dp.utils import parseFasta
seqs = set()   # sequences seen so far (for duplicate removal)
names = set()  # names seen so far (for duplicate removal)
fastafile = open(sys.argv[1])

MIN_SEQ_LEN = 32
MAX_SEQ_UNK = 0.1

TAXONS_HOMO_SAPIENS = {9606}  # NCBI taxonomy ID for Homo sapiens
asoc = GeneAssociations.fromFile(sys.argv[2], taxons=TAXONS_HOMO_SAPIENS)
ontology = Ontology(sys.argv[3])
ontology.setAssociations(asoc)
asoc.transitiveClosure()  # presumably propagates each association up the term hierarchy
associated = set()
# Collect all associated gene/protein names, upper-cased for matching
for k, v in asoc.associations.items():
    associated.update({g.upper() for g in v})

# Secondary-structure sequences; keys presumably look like "<ID>:<CHAIN>:secstr" (see sskey below)
ss = dict(parseFasta("data/ss.txt"))
#print(associated)

for l in fastafile:
    # FASTA header line: ">NAME TYPE ..."; the sequence itself is on the following line
    name, typ, *_ = l[1:].split(" ")
    name = name.upper()
    seq = next(fastafile)
    sskey = "%s:secstr" % name.replace("_", ":")
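    # --- Editor's sketch, not in the original (which is truncated here): one plausible
    # --- way the loop could finish, using only the constants and sets defined above.
    seq = seq.strip()
    if len(seq) < MIN_SEQ_LEN:                   # too short
        continue
    if seq.count("X") / len(seq) > MAX_SEQ_UNK:  # too many unknown residues (assumed "X")
        continue
    if name in names or seq in seqs:             # duplicate name or sequence
        continue
    names.add(name)
    seqs.add(seq)
    print(">%s %s" % (name, typ.strip()))
    print(seq)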
Example #2
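    # Note: this fragment comes from inside a larger driver script; it assumes
    # `options` (parsed command-line arguments), `random`, `sys`, `ontology`,
    # `TAXONS` and `associationsFileName` are defined elsewhere in that script.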
    dataset = None
    if options.dataset:
        # FIXME: when the dataset changes, the serialized associations must be regenerated.
        # This is a serious bug unless random is seeded with a constant.
        dataset = [l.strip() for l in open(options.dataset)]
        random.shuffle(dataset)
        #assert options.reserve > 0.0
        #if options.reserve < 1.0: # Use ratio
        #    splitIndex = int(options.reserve * len(dataset))
        #else:
        #    splitIndex = int(options.reserve)
        #reserved = set(dataset[:splitIndex])
        #dataset = set(dataset[splitIndex:])
        dataset = set(dataset)

    associations = GeneAssociations.fromFile(associationsFileName,
                                             taxons=TAXONS,
                                             dataset=dataset)
    #reservedAssociations = GeneAssociations.fromFile(associationsFileName+"_reserved", dataset = reserved)
    ontology.setAssociations(associations)
    #ontology.setAssociations(reservedAssociations, 'reserved')

    if options.associationsDump:
        associations.serialize(options.associationsDump)
        #reservedAssociations.serialize(options.associationsDump+"_reserved")
        sys.exit()

    # Prune the ontology and associations; `options.lb` and `options.max` presumably
    # bound the number of genes per term (lower/upper).
    ontology.deleteSmallTerms(options.lb)
    associations.shrink(options.max, options.lb)

    ontology.overView()
    ontology.dotExport()