def test_acba_simple_with_gbk_without_promoter(self):
        replicon_filename = 'acba.007.p01.13'
        replicon_id = 'ACBA.007.P01_13'
        command = "integron_finder --outdir {out_dir} --gbk {replicon}".format(out_dir=self.out_dir,
                                                                               replicon=self.find_data(
                                                                                   os.path.join('Replicons',
                                                                                                '{}.fst'.format(replicon_filename))
                                                                                )
                                                                               )

        with self.catch_io(out=True, err=True):
            main(command.split()[1:], loglevel='WARNING')

        output_dirname = 'Results_Integron_Finder_{}'.format(replicon_filename)
        test_result_dir = os.path.join(self.out_dir, output_dirname)
        gbk = '{}.gbk'.format(replicon_id)
        expected_gbk = self.find_data(os.path.join(output_dirname + ".wo_promoter", gbk))
        gbk_test = os.path.join(test_result_dir, gbk)
        expected_gbk = SeqIO.read(expected_gbk, 'gb')
        gbk_test = SeqIO.read(gbk_test, 'gb')
        self.assertSeqRecordEqual(expected_gbk, gbk_test)

        output_filename = '{}.integrons'.format(replicon_filename)
        expected_result_path = self.find_data(os.path.join(output_dirname + ".wo_promoter", output_filename))
        test_result_path = os.path.join(test_result_dir, output_filename)
        self.assertIntegronResultEqual(expected_result_path, test_result_path)
    def test_acba_annot(self):
        replicon_filename = 'acba.007.p01.13'
        replicon_id = 'ACBA.007.P01_13'
        command = "integron_finder --outdir {out_dir} --func-annot --path-func-annot {annot_bank} --promoter-attI " \
                  "--gbk --keep-tmp " \
                  "{replicon}".format(out_dir=self.out_dir,
                                      annot_bank=self.resfams_dir,
                                      replicon=self.find_data(os.path.join('Replicons', '{}.fst'.format(replicon_filename)))
                                      )

        with self.catch_io(out=True, err=False):
            main(command.split()[1:], loglevel='WARNING')

        result_dir = os.path.join(self.out_dir, 'Results_Integron_Finder_{}'.format(replicon_filename))

        gbk = '{}.gbk'.format(replicon_id)
        expected_gbk = self.find_data(os.path.join('Results_Integron_Finder_{}.annot'.format(replicon_filename), gbk))
        gbk_test = os.path.join(result_dir, gbk)
        expected_gbk = SeqIO.read(expected_gbk, 'gb')
        gbk_test = SeqIO.read(gbk_test, 'gb')
        self.assertSeqRecordEqual(expected_gbk, gbk_test)

        output_filename = '{}.integrons'.format(replicon_filename)
        expected_result_path = self.find_data(os.path.join('Results_Integron_Finder_{}.annot'.format(replicon_filename),
                                                           output_filename))
        test_result_path = os.path.join(result_dir, output_filename)
        self.assertIntegronResultEqual(expected_result_path, test_result_path)

        output_filename = os.path.join('tmp_{}'.format(replicon_id), replicon_id + '_Resfams_fa_table.res')
        expected_result_path = self.find_data(os.path.join('Results_Integron_Finder_{}.annot'.format(replicon_filename),
                                                           output_filename))
        test_result_path = os.path.join(result_dir, output_filename)
        self.assertHmmEqual(expected_result_path, test_result_path)
    def test_longer_locus_line(self):
        """Check that we can read and write files with longer locus lines."""
        # Create example file from existing file
        with open(path.join("GenBank", "DS830848.gb"), 'r') as inhandle:
            data = inhandle.readlines()
            data[0] = "LOCUS       AZZZAA021234567891234 2147483647 bp    DNA     linear   PRI 15-OCT-2018\n"

        # Create memory file from modified genbank file
        in_tmp = StringIO()
        in_tmp.writelines(data)
        in_tmp.seek(0)

        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            in_tmp.seek(0)
            record = SeqIO.read(in_tmp, 'genbank')

            # Create temporary output memory file
            out_tmp = StringIO()
            SeqIO.write(record, out_tmp, 'genbank')

            # Check that the written file can be read back in
            out_tmp.seek(0)
            record_in = SeqIO.read(out_tmp, 'genbank')
            self.assertEqual(record_in.id, "DS830848.1")
            self.assertEqual(record_in.name, "AZZZAA021234567891234")
            self.assertEqual(len(record_in.seq), 2147483647)
def compress(self, filename, cd, pos):
    # Remember the parent level/type before reading the source file
    filename.compdeep = cd - 1
    filename.comptype = pos[:-1]
    if filename.ext == '.gb':
        rec = SeqIO.read(filename.get_name(), "genbank")
    else:
        rec = SeqIO.read(filename.get_name(), "fasta")
    ln = len(rec.seq)
    # Switch the file object to the new compression level/type
    filename.compdeep = cd
    filename.comptype = pos
    numpos = int(pos[-1])
    compstep = self.compopt['compstep']
    resseq = Seq('', rec.seq.alphabet)  # note: Seq alphabets were removed in Biopython 1.78
    res = open(filename.get_name(), 'w')
    oligolist = []
    self.complete_oligolist(oligolist, '', compstep)
    # Walk the sequence in windows of compstep; keep position numpos of every
    # window whose lowercase string appears in oligolist
    for i in range(0, ln - ln % compstep, compstep):
        if str(rec.seq[i:i + compstep]).lower() in oligolist:
            resseq += rec.seq[i:i + compstep][numpos]
    rec.seq = resseq
    if filename.ext == '.gb':
        SeqIO.write(rec, res, "genbank")
    else:
        SeqIO.write(rec, res, "fasta")
    res.close()
    return resseq
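complete_oligolist is not part of this snippet; judging by how compress uses it (membership tests against lowercase compstep-mers), it plausibly enumerates every DNA word of length compstep. A minimal sketch under that assumption:

def complete_oligolist(self, oligolist, prefix, compstep):
    # Hypothetical helper: recursively build all words of length compstep
    # over the lowercase DNA alphabet.
    if len(prefix) == compstep:
        oligolist.append(prefix)
    else:
        for base in 'acgt':
            self.complete_oligolist(oligolist, prefix + base, compstep)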
def collect_proteomes_and_annotations(input_dir):
    proteomes = []
    annotations = []

    files = listdir(input_dir)
    if not files:
        interrupt('Directory contains no files.')

    for f in (join(input_dir, f) for f in files if isfile(join(input_dir, f))):
        if splitext(f)[1] in ['.fasta', '.faa', '.fa', '.fsa']:
            try:
                log.debug('   Checking if %s is fasta.' % f)
                next(SeqIO.parse(f, 'fasta'))
            except ValueError:
                pass
            else:
                proteomes.append(f)
                continue

        if splitext(f)[1] in ['.gb', '.genbank', '.gbk']:
            try:
                log.debug('   Checking if %s is genbank.' % f)
                SeqIO.read(f, 'genbank')
            except Exception as e:
                log.debug(str(e) + ', ' + f)
            else:
                annotations.append(f)

    return proteomes, annotations
Example #6
 def testRemovalOfSuffix(self):
     """
     A sequence that is a suffix of another is removed.
     """
     s1 = SeqIO.read(StringIO('>s1\nagtcagtcagtc'), 'fasta')
     s2 = SeqIO.read(StringIO('>s2\ncagtc'), 'fasta')
     self.assertEqual(list(dePrefixAndSuffixFasta([s1, s2])), [s1])
Example #7
def main():
    usage = "usage: %prog [Options]"
    parser = OptionParser(usage=usage)
    parser.add_option("-a", metavar="EMBL-a", help="First EMBL file", action="store", type="string", dest="first_embl")
    parser.add_option("-b", metavar="EMBL-b", help="Second EMBL file to compare", action="store", dest="second_embl")
    parser.add_option("--merge", help="To transfer /product of identical annotations into a merged file", action="store_true", dest="merge")
    
    (options, args) = parser.parse_args()

    # Print help if no argument given
    if util.printHelp(options):
        parser.print_help()
        sys.exit()
    
    first_record = SeqIO.read(open(options.first_embl), "embl")
    second_record = SeqIO.read(open(options.second_embl), "embl")

    print "Analysis of EMBL features A from %s" % options.first_embl
    print "Analysis of EMBL features B from %s" % options.second_embl

    stat(first_record)
    
    if options.merge:
        merged_record = transfer(first_record, second_record)
        # Write out genbank file
        SeqIO.write([merged_record], open("merged.embl", "w"), "embl")
 def test_overlapping_clip(self):
     with open("Roche/greek.sff", "rb") as handle:
         record = next(SeqIO.parse(handle, "sff"))
     self.assertEqual(len(record), 395)
     s = str(record.seq.lower())
     # Apply overlapping clipping
     record.annotations['clip_qual_left'] = 51
     record.annotations['clip_qual_right'] = 44
     record.annotations['clip_adapter_left'] = 50
     record.annotations['clip_adapter_right'] = 75
     self.assertEqual(len(record), 395)
     self.assertEqual(len(record.seq), 395)
     # Save the clipped record...
     h = BytesIO()
     count = SeqIO.write(record, h, "sff")
     # Now reload it...
     h.seek(0)
     with warnings.catch_warnings(record=True) as w:
         warnings.simplefilter("always", BiopythonParserWarning)
         record = SeqIO.read(h, "sff")
         self.assertEqual(len(w), 1, w)
     self.assertEqual(record.annotations['clip_qual_left'], 51)
     self.assertEqual(record.annotations['clip_qual_right'], 44)
     self.assertEqual(record.annotations['clip_adapter_left'], 50)
     self.assertEqual(record.annotations['clip_adapter_right'], 75)
     self.assertEqual(len(record), 395)
     self.assertEqual(s, str(record.seq.lower()))
     # And check with trimming applied...
     h.seek(0)
     with warnings.catch_warnings(record=True) as w:
         warnings.simplefilter("always", BiopythonParserWarning)
         record = SeqIO.read(h, "sff-trim")
         self.assertEqual(len(w), 1, w)
     self.assertEqual(len(record), 0)
 def test_structured_comment_parsing(self):
     """Structued comment parsing."""
     # GISAID_EpiFlu(TM)Data, HM138502.gbk has both 'comment' and 'structured_comment'
     record = SeqIO.read(path.join('GenBank', 'HM138502.gbk'), 'genbank')
     self.assertEqual(record.annotations['comment'],
                      "Swine influenza A (H1N1) virus isolated during human swine flu\noutbreak of 2009.")
     self.assertEqual(record.annotations['structured_comment']['GISAID_EpiFlu(TM)Data']['Lineage'], 'swl')
     self.assertEqual(len(record.annotations['structured_comment']['GISAID_EpiFlu(TM)Data']), 3)
     with open(path.join('GenBank', 'HM138502_output.gbk'), "r") as ifile:
         self.assertEqual(record.format("gb"), ifile.read())
     # FluData structured comment
     record = SeqIO.read(path.join('GenBank', 'EU851978.gbk'), 'genbank')
     self.assertEqual(record.annotations['structured_comment']['FluData']['LabID'], '2008704957')
     self.assertEqual(len(record.annotations['structured_comment']['FluData']), 5)
     with open(path.join('GenBank', 'EU851978_output.gbk'), "r") as ifile:
         self.assertEqual(record.format("gb"), ifile.read())
     # Assembly-Data structured comment
     record = SeqIO.read(path.join('GenBank', 'KF527485.gbk'), 'genbank')
     self.assertEqual(record.annotations['structured_comment']['Assembly-Data']['Assembly Method'], 'Lasergene v. 10')
     self.assertEqual(len(record.annotations['structured_comment']['Assembly-Data']), 2)
     with open(path.join('GenBank', 'KF527485_output.gbk'), "r") as ifile:
         self.assertEqual(record.format("gb"), ifile.read())
     # No structured comment in NC_000932.gb, just a regular comment
     record = SeqIO.read(path.join('GenBank', 'NC_000932.gb'), 'genbank')
     self.assertFalse("structured_comment" in record.annotations)
     self.assertEqual(record.annotations['comment'],
                      'REVIEWED REFSEQ: This record has been curated by NCBI staff. The\n'
                      'reference sequence was derived from AP000423.\n'
                      'COMPLETENESS: full length.')
Example #10
def load_file(filename, file_format="fasta"):
    """
    Load sequence from file and returns sequence as Bio.Seq object
    :param filename:     String; Path and filename of input sequence file
    :param file_format:  String; Format to be used. Refer to Biopython docs for available formats. Defaults to 'fasta'
    """
    content = None
    try:
        # assume sequence is DNA
        content = SeqIO.read(filename, file_format, IUPAC.ambiguous_dna)
    except ValueError as error:
        # if this fails, try RNA instead
        print('ERROR: {}'.format(error))
        try:
            content = SeqIO.read(filename, file_format, IUPAC.ambiguous_rna)
        except ValueError as error:
            # if this fails, too, raise exception and exit with error code 1
            print('ERROR: {}'.format(error))
            exit(1)

    # if some kind of data could be read, return the sequence object
    if content:
        seq = content.seq
        return seq
    # else return None
    else:
        return None
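A quick usage sketch of load_file ('example.fasta' is a placeholder path, not from the original snippet):

# Hypothetical usage: load a sequence, letting load_file fall back from
# DNA to RNA parsing internally
seq = load_file('example.fasta')
if seq is not None:
    print('Loaded sequence of length {}'.format(len(seq)))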
Example #11
def generateReads(refGene):
    currentChr = refGene[0][2]
    sequence=SeqIO.read('%s/%s.fa'%(options.chromosomes,currentChr),'fasta')
    print "Generating reads for chromosome " + sequence.description
    f=open(options.output,'w')
    countChr = 0
    for gene in refGene:
        if gene[2] != currentChr:
            print('%d reads generated for chromosome %s' % (countChr, currentChr))
            currentChr = gene[2]
            sequence=SeqIO.read('%s/%s.fa'%(options.chromosomes,currentChr),'fasta')
            countChr=0
            print "Generating reads for chromosome " + sequence.description
        strand = gene[3]
        numExons = int(gene[8])
        exonStarts = gene[9].split(',')
        exonEnds = gene[10].split(',')
        exons = []
        for i in range(numExons):
            exons.append([int(exonStarts[i]),int(exonEnds[i])])
        exons.sort(key=lambda e:(e[0],e[1]))
        if options.mode=='au':
            countChr += adjUni(exons,strand,sequence,f)
        elif options.mode=='an':
            pass
        elif options.mode=='cu':
            pass
        elif options.mode=='cn':
            pass
    f.close()
    def test_genbank_date_list(self):
        """Check if date lists are handled correctly"""

        sequence_object = Seq("ATGC", generic_dna)
        record = SeqRecord(sequence_object,
                           id='123456789',
                           name='UnitTest',
                           description='Test case for date parsing')
        record.annotations["date"] = ["24-DEC-2015"]
        handle = StringIO()
        SeqIO.write(record, handle, 'genbank')
        handle.seek(0)
        gb = SeqIO.read(handle, "gb")
        self.assertEqual(gb.annotations["date"], "24-DEC-2015")

        record = SeqRecord(sequence_object,
                           id='123456789',
                           name='UnitTest',
                           description='Test case for date parsing')
        record.annotations["date"] = ["24-DEC-2015", "25-JAN-2016"]
        handle = StringIO()
        SeqIO.write(record, handle, 'genbank')
        handle.seek(0)
        gb = SeqIO.read(handle, "gb")
        self.assertEqual(gb.annotations["date"], "01-JAN-1980")
Example #13
def process(filename,compopt={'compdeep':3,'compstep':3,'comptype':[0,1,2]},oligox=[1,2,3]):
    """
    compopt={'compdeep':compdeep,'compstep':compstep,'comptype':comptype};
    comptype=[posi,poj...]
    """
      
    if filename.rpartition('.gb')[0]!='':
        path=filename.rpartition('/')[0]+'/'+filename.rpartition('/')[2].rpartition('.gb')[0]+'_semantix'
        reppath=path+'/report'
        inputfile=reppath+'/'+filename.rpartition('/')[2].rpartition('.gb')[0]+'.gb'
    elif filename.rpartition('.fas')[0]!='':
        path=filename.rpartition('/')[0]+'/'+filename.rpartition('/')[2].rpartition('.fas')[0]+'_semantix'
        reppath=path+'/report'
        inputfile=reppath+'/'+filename.rpartition('/')[2].rpartition('.fas')[0]+'.fas'
    inputfile=xfname(inputfile,compopt['compstep'],0,'')
    try:
        os.mkdir(path)
    except:
        pass
    
    try:
        os.mkdir(reppath)
    except:
        pass
##        print inputfile.get_name()
##        print inputfile.get_name().rpartition('/')[2]
    

    linklist=[]
    
    if inputfile.get_name().rpartition('/')[2] not in os.listdir(reppath):
        shutil.copy2(filename,inputfile.get_name())

    if inputfile.ext=='.gb':
##        print inputfile.get_name()
        data=SeqIO.read(inputfile.get_name(),"genbank").seq
    else:
        data=SeqIO.read(inputfile.get_name(),"fasta").seq
##    print oligox
    form_linklist(linklist,inputfile)
    for o in oligox:
##        print o
        Tez,Tez_rev=get_Tez(data,o)
        ramka,signs=get_matrix(Tez,Tez_rev)
        infotab=[['oligonucleotide:',str(o)+'-plet'],['compress level:',inputfile.get_level()]]
        html_write(infotab,ramka,o,inputfile,linklist)
        
    for cd in range(1, compopt['compdeep'] + 1):
        poslist=[]
        get_pos_on_level(poslist,'',cd,compopt)
        for pos in  poslist:
            data=compress(inputfile,cd,pos,compopt)
            form_linklist(linklist,inputfile)
            for o in oligox:
##                print o
                Tez,Tez_rev=get_Tez(data,o)
                ramka,signs=get_matrix(Tez,Tez_rev)
                infotab=[['oligonucleotide:',str(o)+'-plet'],['compress level:',inputfile.get_level()]]
                html_write(infotab,ramka,o,inputfile,linklist)
    form_report(path,linklist,oligox,compopt)
Example #14
 def test_genbank_to_fasta(self):
     """Conversion of GenBank to FASTA."""
     filename = "GenBank/NC_005816.gb"
     old = SeqIO.read(filename, "gb")
     with open(filename) as handle:
         new = SeqIO.read(TogoWS.convert(handle, "genbank", "fasta"), "fasta")
     self.assertEqual(str(old.seq), str(new.seq))
Example #15
    def __init__(self, fname = 'data/H3N2_gisaid_epiflu_sequence.fasta',
                 out_specs={'data_dir':'data/', 'prefix':'H3N2_', 'qualifier':''},
                 **kwargs):
        super(flu_process, self).__init__()
        self.fname = fname
        self.kwargs = kwargs
        self.out_specs = out_specs
        if 'outgroup' in kwargs:
            outgroup_file = kwargs['outgroup']
        else:
            outgroup_file = 'source_data/'+out_specs['prefix']+'outgroup.gb'
        tmp_outgroup = SeqIO.read(outgroup_file, 'genbank')
        self.outgroup = tmp_outgroup.features[0].qualifiers['strain'][0]
        genome_annotation = tmp_outgroup.features
        ref_seq = SeqIO.read(outgroup_file, 'genbank')
        self.proteins = {f.qualifiers['gene'][0]:FeatureLocation(start=f.location.start, end=f.location.end, strand=1)
                for f in ref_seq.features if 'gene' in f.qualifiers and f.qualifiers['gene'][0] in ['SigPep', 'HA1', 'HA2']}

        self.time_interval = [datetime.strptime('2008-01-01', "%Y-%m-%d").date(),
                              datetime.strptime('2016-01-01', "%Y-%m-%d").date()]
        self.frequencies = defaultdict(dict)
        self.pivots = np.linspace(num_date(self.time_interval[0]),
                                  num_date(self.time_interval[1]),40)

        self.seqs = sequence_set(self.fname, reference=self.outgroup)
        self.seqs.ungap()
        self.seqs.parse({0:'strain', 1:'isolate_id', 3:'passage', 5:'date', 7:'lab', 8:"accession"}, strip='_')
        self.fix_strain_names()
        self.seqs.raw_seqs[self.outgroup].seq=tmp_outgroup.seq
        self.seqs.raw_seqs['A/Beijing/32/1992'].attributes['date']='1992-01-01'
        self.seqs.reference = self.seqs.raw_seqs[self.outgroup]
        self.seqs.parse_date(["%Y-%m-%d"], prune=True)
        self.geo_parse()
        self.filenames()
    def get_genome_file(self, especie=0, idn=0):  # creates the file and hands its name back so the previous function can extract the sequence from it
       return_filename=""        
       
       if idn==0:
           hand=Entrez.esearch(db='nucleotide',term=especie+"[ORGN]",retmax=100,rettype="gb",retmode="text")
           results=Entrez.read(hand)
           idnum=results["IdList"][0]  # first element of the results list
           print(idnum)
           
           handle=Entrez.efetch(db='nucleotide',rettype="fasta",retmode="text",id=idnum)
    #       record=Entrez.read(handle)
           #print(record[0].keys())
           #print (record[0]["TSeq_defline"])
           #print handle.read()
           
           
           read=SeqIO.read(handle,"fasta")
           name="genome_"+str(especie).strip(" ")+".fasta"
           SeqIO.write(read,name, "fasta")
           handle.close()
           return_filename+=name
           # record = SeqIO.read("genome_escherichia coli.fasta", "fasta")
           # print(record)
          # filename = "genome_"+especie+".gb"
       
       elif especie==0:
           handle=Entrez.efetch(db='nucleotide',rettype="fasta",retmode="text",id=idn)
           read=SeqIO.read(handle,"fasta")
           name="genome_"+str(idn).strip(" ")+".fasta"
           SeqIO.write(read,name,"fasta")
           handle.close()
           return_filename+=name
       #print return_filename
       return return_filename
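A hypothetical call on an instance of the enclosing class ('finder' is a stand-in instance name, and 'Escherichia coli' a placeholder query), assuming Entrez.email has been set as NCBI requires:

# Fetch the first nucleotide hit for a species name and save it as FASTA
fasta_name = finder.get_genome_file(especie='Escherichia coli')
print(fasta_name)  # e.g. 'genome_Escherichia coli.fasta'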
 def get_raw_check(self, filename, format, alphabet):
     handle = open(filename, "rb")
     raw_file = handle.read()
     handle.close()
     #Also checking the key_function here
     id_list = [rec.id.lower() for rec in \
                SeqIO.parse(filename, format, alphabet)]
     rec_dict = SeqIO.index(filename, format, alphabet,
                            key_function = lambda x : x.lower())
     self.assertEqual(set(id_list), set(rec_dict.keys()))
     self.assertEqual(len(id_list), len(rec_dict))
     for key in id_list:
         self.assertTrue(key in rec_dict)
         self.assertEqual(key, rec_dict[key].id.lower())
         self.assertEqual(key, rec_dict.get(key).id.lower())
         raw = rec_dict.get_raw(key)
         self.assertTrue(raw.strip())
         self.assertTrue(raw in raw_file)
         rec1 = rec_dict[key]
         #Following isn't very elegant, but it lets me test the
         #__getitem__ SFF code is working.
         if format in SeqIO._BinaryFormats:
             handle = BytesIO(raw)
         else:
             handle = StringIO(_bytes_to_string(raw))
         if format == "sff":
             rec2 = SeqIO.SffIO._sff_read_seq_record(handle,
                         rec_dict._proxy._flows_per_read,
                         rec_dict._proxy._flow_chars,
                         rec_dict._proxy._key_sequence,
                         rec_dict._proxy._alphabet,
                         trim=False)
         elif format == "sff-trim":
             rec2 = SeqIO.SffIO._sff_read_seq_record(handle,
                         rec_dict._proxy._flows_per_read,
                         rec_dict._proxy._flow_chars,
                         rec_dict._proxy._key_sequence,
                         rec_dict._proxy._alphabet,
                         trim=True)
         elif format == "uniprot-xml":
             self.assertTrue(raw.startswith(_as_bytes("<entry ")))
             self.assertTrue(raw.endswith(_as_bytes("</entry>")))
             #Currently the __getitem__ method uses this
             #trick too, but we hope to fix that later
             raw = """<?xml version='1.0' encoding='UTF-8'?>
             <uniprot xmlns="http://uniprot.org/uniprot"
             xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
             xsi:schemaLocation="http://uniprot.org/uniprot
             http://www.uniprot.org/support/docs/uniprot.xsd">
             %s
             </uniprot>
             """ % _bytes_to_string(raw)
             handle = StringIO(raw)
             rec2 = SeqIO.read(handle, format, alphabet)
         else:
             rec2 = SeqIO.read(handle, format, alphabet)
         self.assertEqual(True, compare_record(rec1, rec2))
     rec_dict._proxy._handle.close() #TODO - Better solution
     del rec_dict
Example #18
def genbank_extract_exon(p_genbank, p_genome=None, p_output=None):
    '''
    Extract exons from a genbank file
    :param p_genbank: path to the genbank file
    :param p_genome: optional fasta genome to take the raw sequence from
    :param p_output: path to the fasta output file for the extracted CDS

    :return:
    '''

    if p_output == None:
        p_output = os.path.basename(p_genbank)+'.exon'


    genome=SeqIO.read(p_genbank,'genbank')

    if p_genome is None:
        full_seq = genome.seq
    else:
        genome_fasta = SeqIO.read(p_genome, 'fasta')
        full_seq = genome_fasta.seq


    fasta_format = '>{type}|{genome}|position={start}-{stop}:{strand}|locus={locus}|gene={gene}|product={product}\n{seq}\n'

    fout = open(p_output, 'w')
    nb_sequence = 0
    for gene in genome.features:
        if gene.type in ['CDS']:
            d_info = {'type' : gene.type, 'genome':genome.id}

            d_info['seq']       = gene.extract(full_seq)
            d_info['start']     = gene.location.start.position
            d_info['stop']      = gene.location.end.position
            d_info['strand']    = gene.location.strand

            # Some genes, like pseudogenes or transposons, have no product
            # qualifier; pseudogenes are skipped entirely
            if 'product' not in gene.qualifiers:
                if 'pseudogene' not in ''.join(gene.qualifiers.get('note', [])):
                    d_info['product'] = ''
                else:
                    continue  # we do not take pseudogenes

            else:
                d_info['product']  = ','.join(gene.qualifiers['product'])



            d_info['gene']      = ','.join(gene.qualifiers['gene'])
            d_info['locus']     = ','.join(gene.qualifiers['locus_tag'])

            fout.write(fasta_format.format(**d_info))

            nb_sequence += 1

    fout.close()
    print('%d exons have been extracted in %s' % (nb_sequence, p_output))

    return None
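A possible invocation of the function above, with illustrative filenames (not from the original snippet):

# Hypothetical call: extract CDS features from a GenBank record into FASTA
genbank_extract_exon('NC_005816.gb', p_output='NC_005816_exons.fasta')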
Example #19
def load_HXB2(cropped=False, fragment=None, trim_primers=False):
    '''Load HXB2 reference sequence'''
    if fragment is None:
        return SeqIO.read(get_HXB2_entire(cropped=cropped), 'fasta')
    else:
        return SeqIO.read(get_HXB2_fragmented(fragment,
                                              trim_primers=trim_primers),
                          'fasta')
Example #20
def load_NL43(fragment=None, trim_primers=False):
    '''Load NL4-3 reference sequence'''
    if fragment is None:
        return SeqIO.read(get_NL43_entire(), 'fasta')
    else:
        return SeqIO.read(get_NL43_fragmented(fragment,
                                              trim_primers=trim_primers),
                          'fasta')
Example #21
def load_F10(fragment=None, trim_primers=False):
    '''Load F10 reference sequence'''
    if fragment is None:
        return SeqIO.read(get_F10_entire(), 'fasta')
    else:
        return SeqIO.read(get_F10_fragmented(fragment,
                                             trim_primers=trim_primers),
                          'fasta')
 def testOrderIndependent(self):
     """
     A sequence that is a prefix of another is removed when it appears
     first.
     """
     s1 = SeqIO.read(StringIO('>s1\nagtcag'), 'fasta')
     s2 = SeqIO.read(StringIO('>s2\nagtcagtcagtc'), 'fasta')
     self.assertEqual(list(dePrefixAndSuffixFasta([s1, s2])), [s2])
Example #23
 def testRemovalOfIdenticalSequences(self):
     """
     A list with 2 copies of the same seq is de-duped to have 1 copy.
     """
     seq = '>hey\nagtcagtcagtc'
     s1 = SeqIO.read(StringIO(seq), 'fasta')
     s2 = SeqIO.read(StringIO(seq), 'fasta')
     self.assertEqual(list(dePrefixAndSuffixFasta([s1, s2])), [s1])
Example #24
    def test_view(self):
        client = Client()
        url = reverse('multipartite_view_free')
        response = client.get(url)
        assert "pDGB2_alpha1R" in str(response)

        url = reverse('multipartite_view_free', kwargs={'form_num': '1'})

        response = client.post(url, {'vector': 'pDGB2_alpha1R',
                                     'part_1': 'pP2A11'})
        assert "An11" in str(response)

        url = reverse('multipartite_view_free', kwargs={'form_num': '2'})
        response = client.post(url, {'vector': 'pDGB2_alpha1R',
                                     'part_1': 'pP2A11',
                                     'part_2': 'pLuciferas'})
        assert 'feature does not exist' in str(response)

        response = client.post(url, {'vector': 'pDGB2_alpha1R',
                                     'part_1': 'pP2A11',
                                     'part_2': 'pLuciferase'})
        assert "pT35S" in str(response)

        response = client.post(url, {'vector': 'pDGB2_alpha1R',
                                     'part_1': 'pP2A11',
                                     'part_2': 'pLuciferase',
                                     'part_3': 'pT35S'})

        assert "<p>You have assembled in the GoldenBraid" in str(response)

        # reverse vector
        url = reverse('multipartite_view_free_genbank')
        response = client.post(url, {'part_1': 'pP2A11',
                                     'part_2': 'pMYB12',
                                     'part_3': 'pTerm2A11',
                                     'vector': 'pDGB1_alpha1R'})

        assert response.status_code == 200

        seqrec1 = SeqIO.read(StringIO(str(response)), 'gb')
        assert seqrec1.name == 'GB_UA_E'
        multipartite_free_seq1 = str(seqrec1.seq)
        gb_path = os.path.join(TEST_DATA, 'pEGBMybrev_uniq.gb')
        seqrec2 = SeqIO.read(gb_path, 'gb')
        multipartite_free_seq2 = str(seqrec2.seq)[4:]
        multipartite_free_seq2 += str(seqrec2.seq)[:4]

        assert multipartite_free_seq1 == multipartite_free_seq2

        # with more than one part of the same type
        url = reverse('multipartite_view_free', kwargs={'form_num': '5'})
        response = client.post(url, {'part_1': 'pP2A11',
                                     'part_2': 'GB0365',
                                     'part_3': 'GB0653',
                                     'part_4': 'GB0655',
                                     'part_5': 'pT35S',
                                     'vector': 'pDGB1_alpha1'})
        assert "<p>Other.2:<a href='/feature/GB0655'>GB0655</a></p>" in  str(response)
Example #25
	def __init__(self, seq_id=None, seq_type=None):
		"""Sets variables for the instance."""
		if seq_type == 'uniprot':
			handle = ExPASy.get_sprot_raw(seq_id)
			self.seq_record = SeqIO.read(handle, "swiss")
		elif seq_type == 'genbank':
			handle = Entrez.efetch(db='protein', rettype='genbank', id=seq_id)
			self.seq_record = SeqIO.read(handle, "genbank")
		handle.close()
 def test_001_negative_location_warning(self):
     with warnings.catch_warnings():
         warnings.simplefilter("error", BiopythonParserWarning)
         try:
             SeqIO.read(path.join("GenBank", "negative_location.gb"), "genbank")
         except BiopythonParserWarning as e:
             self.assertEqual(str(e), "Couldn't parse feature location: '-2..492'")
         else:
             self.assertTrue(False, "Expected specified BiopythonParserWarning here.")
    def read_sequences(self, source, destination):
        """
        Reads in the source and destination FASTA files.
        """

        self.src_nt = SeqIO.read(source, 'fasta', alphabet=generic_dna)
        self.des_aa = SeqIO.read(destination,
                                 'fasta',
                                 alphabet=generic_protein)
 def testRemovalOfPrefixSuffixAndDuplicate(self):
     """
     Prefixes, suffixes, and duplicates should collectively all be removed.
     """
     s1 = SeqIO.read(StringIO('>s1\nagtcagtcagtc'), 'fasta')
     s2 = SeqIO.read(StringIO('>s2\nagtcagtcagtc'), 'fasta')
     s3 = SeqIO.read(StringIO('>s3\nagtcagt'), 'fasta')
     s4 = SeqIO.read(StringIO('>s4\ntcagtc'), 'fasta')
     self.assertEqual(list(dePrefixAndSuffixFasta([s1, s2, s3, s4])), [s1])
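dePrefixAndSuffixFasta itself is not included in these snippets; the following is a minimal sketch consistent with the four tests above (drop any read whose sequence is a prefix or suffix of another, keep only the first copy of exact duplicates) — an illustration, not the project's actual implementation:

def dePrefixAndSuffixFasta(sequences):
    # Sketch implementation (semantics inferred from the tests): yield a
    # read unless its sequence is a strict prefix/suffix of some other
    # read, or a duplicate of an earlier read.
    seqs = list(sequences)
    strings = [str(s.seq) for s in seqs]
    for i, record in enumerate(seqs):
        keep = True
        for j, other in enumerate(strings):
            if j == i:
                continue
            if strings[i] == other:
                if j < i:  # only the first copy of a duplicate survives
                    keep = False
                    break
            elif other.startswith(strings[i]) or other.endswith(strings[i]):
                keep = False
                break
        if keep:
            yield record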
Example #29
 def test_Genome(self):
     """Checking GenBank sequence vs FASTA fna file."""
     gb_record = SeqIO.read(open(self.gb_filename),"genbank")
     fa_record = SeqIO.read(open(self.fna_filename),"fasta")
     compare_record(gb_record, fa_record)
     if self.emblname is None:
         return
     embl_record = SeqIO.read(open(self.embl_filename),"embl")
     compare_record(gb_record, embl_record, expect_minor_diffs=True)
def draw_pairwise(args, dwg, n1, n2, pos1, pos2, y_start):
    g = dwg.g()
    x_start = args.margin
    fasta1 = path.join(args.patser_directory, n1 + '.fasta')
    fasta2 = path.join(args.patser_directory, n2+'.fasta')
    if path.exists(fasta1) and path.exists(fasta2):
        seq1 = SeqIO.read(fasta1, 'fasta').seq
        seq2 = SeqIO.read(fasta2, 'fasta').seq
    elif args.fasta:
        if n1 in args.fasta and n2 in args.fasta:
            seq1 = args.fasta[n1].seq
            seq2 = args.fasta[n2].seq
        else:
            print("Can't find both sequences: {} and {}".format(n1, n2))
            seq1 = defaultdict(lambda : 'N')
            seq2 = defaultdict(lambda : 'N')
    else:
        seq1 = defaultdict(lambda : 'N')
        seq2 = defaultdict(lambda : 'N')

    lines.append(dwg.line(
        (x_start, y_start),
        (x_start + args.x_scale * len(pos1), y_start)))

    indels = diff(pos2) - diff(pos1)
    id_start = 0
    # Draw Indels
    for val, group in it.groupby(indels):
        i = sum(1 for _ in group)  # length of this run of identical indel values
        g.add(dwg.rect(
            (x_start + args.x_scale * id_start, y_start - .1 * args.y_sep * (val < 0)),
            (args.x_scale * i, .1 * args.y_sep * (val != 0)),
            fill="grey"
            ))
        # SNPs
        if val == 0:
            for j in range(id_start, id_start + i):
                if str(seq1[pos1[j]]) != str(seq2[pos2[j]]):
                    g.add(dwg.line(
                        (x_start + args.x_scale * j, y_start - .3 * args.y_sep),
                        (x_start + args.x_scale * j, y_start),
                        style="stroke-width:1; stroke:{};".format(seq_colors[str(seq1[pos1[j]])]),
                        ))
                    g.add(dwg.line(
                        (x_start + args.x_scale * j, y_start),
                        (x_start + args.x_scale * j, y_start + .3 * args.y_sep),
                        id='{}:{}--{}>{}'.format(pos1[j], pos2[j], seq1[pos1[j]], seq2[pos2[j]],),
                        style="stroke-width:1; stroke:{};".format(seq_colors[str(seq2[pos2[j]])]),
                        ))

        id_start += i
    y_start += 0.5 * delta_y
    dwg.add(g)
    return y_start
    parser.add_argument("--reference",
                        type=str,
                        required=True,
                        help="reference sequence")
    parser.add_argument("--metadata", type=str, required=True, help="metadata")
    parser.add_argument("--focal-alignment",
                        type=str,
                        required=True,
                        help="focal smaple of sequences")
    parser.add_argument("--output",
                        type=str,
                        required=True,
                        help="FASTA file of output alignment")
    args = parser.parse_args()

    # load entire alignment and the alignment of focal sequences (upper case -- probably not necessary)
    ref = sequence_to_int_array(SeqIO.read(args.reference, 'genbank').seq)
    context_seqs_dict = calculate_snp_matrix(args.alignment, consensus=ref)
    focal_seqs_dict = calculate_snp_matrix(args.focal_alignment, consensus=ref)
    alignment_length = len(ref)
    print("Done reading the alignments.")

    # calculate number of masked sites in either set
    mask_count_focal = np.array(
        [len(x) for x in focal_seqs_dict['filled_positions']])
    mask_count_context = {
        s: len(x)
        for s, x in zip(context_seqs_dict['names'],
                        context_seqs_dict['filled_positions'])
    }

    # for each context sequence, calculate minimal distance to focal set, weigh with number of N/- to pick best sequence
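Neither sequence_to_int_array nor calculate_snp_matrix is shown in this excerpt. A plausible sketch of the first, assuming it just maps nucleotide letters to small integers so downstream comparisons run as NumPy operations (the real helper may differ):

import numpy as np

def sequence_to_int_array(seq):
    # Hypothetical reconstruction: A/C/G/T (any case) become 1..4, anything
    # else (N, gaps, ambiguity codes) becomes 0.
    lookup = {'a': 1, 'c': 2, 'g': 3, 't': 4}
    return np.fromiter((lookup.get(c, 0) for c in str(seq).lower()),
                       dtype=np.int8, count=len(seq))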
Example #32
#!/usr/bin/env python

import argparse
from Bio import SeqIO, Restriction
from Bio.Alphabet import IUPAC

parser = argparse.ArgumentParser(
    description=
    'record MseI sites for given chromosome FASTA in bed file format')
parser.add_argument('-f',
                    help='chromosome FASTA file (ex. chr21.fa)',
                    type=str,
                    dest='f',
                    required=True)
args = parser.parse_args()

seq_record = SeqIO.read(args.f, "fasta", IUPAC.ambiguous_dna)
coords = Restriction.MseI.search(seq_record.seq)

chrom = seq_record.id
OUT = open('MseI_sites_' + chrom + '.bed', 'w')
for start in coords:
    # Note: compensate for search function finding first base after the
    # position the enzyme will cut.
    OUT.write('\t'.join([chrom, str(start - 2), str((start - 2) + 4)]) + '\n')
OUT.close()
Example #33
from __future__ import print_function

try:
    from cStringIO import StringIO
except ImportError:
    from io import StringIO

from Bio import SeqIO
import requests
""" Get all families for human """
url = "https://dfam.org/api/families"
params = {
    "format": "summary",
    "clade": "9606",
    "clade_relatives": "both",
}
response = requests.get(url, params=params)
results = response.json()["results"]

records = []
for r in results:
    if r['repeat_type_name'] == 'LTR':
        nurl = url + '/' + r['accession'] + '/sequence'
        response2 = requests.get(nurl, params={'format': 'embl'})
        rec = SeqIO.read(StringIO(response2.text.encode('ascii', 'ignore').decode('ascii')),
                         'embl')
        records.append(rec)

SeqIO.write(records, 'ERV_human.dfam.gb', 'genbank')
SeqIO.write(records, 'ERV_human.dfam.fasta', 'fasta')
Example #34
#print removals
for removal in genomeRemoved:
    print("Removed: " + removal + ' --incomplete genome')
print()
for removal in taxonRemoved:
    print("Removed: " + removal + ' --not a virus')
outf = open(outputFile, 'w')
for refID in blastInfos.keys():
    flag = True
    while flag:
        try:
            handle = Entrez.efetch(db="nucleotide",
                                   id=refID,
                                   rettype="gb",
                                   retmode="text")
            record = SeqIO.read(handle, "genbank")
            handle.close()
            seq = record.seq
            try:
                name = record.description
            except Exception:
                name = 'name-not-found'
                print('nnn')
            outf.write('>' + refID + '\n')
            outf.write(str(seq) + '\n')
            flag = False
        except Exception:
            time.sleep(10)
print('sequences written to: ' + outputFile)
os.remove(flagfileName)
def load_default_plastid():
    return SeqIO.read("Plastids/Arabidopsis_thaliana.gb", 'gb')
#!/usr/bin/env python

# http://biopython.org/DIST/docs/tutorial/Tutorial.html
# 20.1.13. Identifying open reading frames
# https://biopython.readthedocs.io/en/latest/Tutorial/chapter_cookbook.html

from Bio import SeqIO
record = SeqIO.read("NC_005816.fna", "fasta")
table = 11
min_pro_len = 100

for strand, nuc in [(+1, record.seq), (-1, record.seq.reverse_complement())]:
    for frame in range(3):
        length = 3 * ((len(record) - frame) // 3)  #Multiple of three
        for pro in nuc[frame:frame + length].translate(table).split("*"):
            if len(pro) >= min_pro_len:
                print("%s...%s - length %i, strand %i, frame %i" \
                % (pro[:30], pro[-3:], len(pro), strand, frame))
Example #37
#! /usr/bin/python

from sys import argv
from Bio import pairwise2  # uses Biopython wrapper for pairwise alignment
from Bio import SeqIO  # to parse the sequences
seq1 = SeqIO.read(argv[1], 'fasta')
seq2 = SeqIO.read(argv[2], 'fasta')
alignments = pairwise2.align.globalxx(seq1.seq, seq2.seq)
# "xx" is the two-character code that sets first the match score
# and then the cost for gaps

print(pairwise2.format_alignment(*alignments[0]))
Example #38
from Bio import SeqIO

record = SeqIO.read('sequenceBST2.gb', 'genbank')
# Information about the sequence
print(record.id, '\n')
print(record.seq, '\n')
print(record.description, '\n')
print(record.name, '\n')
print(len(record.seq), '\n')
print(record.dbxrefs, '\n')
print(record.annotations["source"], '\n')
# annotations, features and qualifiers
for k, v in record.annotations.items():
    print(k, v)
print('\n', len(record.features), '\n')
for i in record.features:
    print(i)
print('\n', record.features, '\n')
Example #39
def seq_record_loaded_from_file_example(fasta_path):
    """Original SeqRecord loaded from sequence file"""
    return SeqIO.read(fasta_path, "fasta")
Example #40
    def test_uni001(self):
        "Parsing Uniprot file uni001"
        filename = 'uni001'
        # test the record parser

        datafile = os.path.join('SwissProt', filename)

        with open(datafile) as test_handle:
            seq_record = SeqIO.read(test_handle, "uniprot-xml")

        self.assertTrue(isinstance(seq_record, SeqRecord))

        # test a couple of things on the record -- this is not exhaustive
        self.assertEqual(seq_record.id, "Q91G55")
        self.assertEqual(seq_record.name, "043L_IIV6")
        self.assertEqual(seq_record.description,
                         "Uncharacterized protein 043L")
        self.assertEqual(
            repr(seq_record.seq),
            "Seq('MDLINNKLNIEIQKFCLDLEKKYNINYNNLIDLWFNKESTERLIKCEVNLENKI...IPI', ProteinAlphabet())"
        )

        # self.assertEqual(seq_record.accessions, ['Q91G55']) #seq_record.accessions does not exist
        # self.assertEqual(seq_record.organism_classification, ['Eukaryota', 'Metazoa', 'Chordata', 'Craniata', 'Vertebrata', 'Mammalia', 'Eutheria', 'Primates', 'Catarrhini', 'Hominidae', 'Homo'])
        # self.assertEqual(record.seqinfo, (348, 39676, '75818910'))

        self.assertEqual(len(seq_record.features), 1)
        self.assertEqual(
            repr(seq_record.features[0]),
            "SeqFeature(FeatureLocation(ExactPosition(0), ExactPosition(116)), type='chain', id='PRO_0000377969')"
        )

        self.assertEqual(len(seq_record.annotations['references']), 2)
        self.assertEqual(seq_record.annotations['references'][0].authors,
                         'Jakob N.J., Mueller K., Bahr U., Darai G.')
        self.assertEqual(
            seq_record.annotations['references'][0].title,
            'Analysis of the first complete DNA sequence of an invertebrate iridovirus: coding strategy of the genome of Chilo iridescent virus.'
        )
        self.assertEqual(seq_record.annotations['references'][0].journal,
                         'Virology 286:182-196(2001)')
        self.assertEqual(
            seq_record.annotations['references'][0].comment,
            'journal article | 2001 | Scope: NUCLEOTIDE SEQUENCE [LARGE SCALE GENOMIC DNA] | '
        )

        self.assertEqual(len(seq_record.dbxrefs), 11)
        self.assertEqual(seq_record.dbxrefs[0], 'DOI:10.1006/viro.2001.0963')

        self.assertEqual(seq_record.annotations['sequence_length'], 116)
        self.assertEqual(seq_record.annotations['sequence_checksum'],
                         '4A29B35FB716523C')
        self.assertEqual(seq_record.annotations['modified'], '2009-07-07')
        self.assertEqual(seq_record.annotations['accessions'], ['Q91G55'])
        self.assertEqual(seq_record.annotations['taxonomy'], [
            'Viruses', 'dsDNA viruses, no RNA stage', 'Iridoviridae',
            'Iridovirus'
        ])
        self.assertEqual(seq_record.annotations['sequence_mass'], 13673)
        self.assertEqual(seq_record.annotations['dataset'], 'Swiss-Prot')
        self.assertEqual(seq_record.annotations['gene_name_ORF'],
                         ['IIV6-043L'])
        self.assertEqual(seq_record.annotations['version'], 21)
        self.assertEqual(seq_record.annotations['sequence_modified'],
                         '2001-12-01')
        self.assertEqual(seq_record.annotations['keywords'],
                         ['Complete proteome', 'Virus reference strain'])
        self.assertEqual(seq_record.annotations['organism_host'], [
            'Acheta domesticus', 'House cricket', 'Chilo suppressalis',
            'striped riceborer', 'Gryllus bimaculatus', 'Two-spotted cricket',
            'Gryllus campestris', 'Spodoptera frugiperda', 'Fall armyworm'
        ])
        self.assertEqual(seq_record.annotations['created'], '2009-06-16')
        self.assertEqual(seq_record.annotations['organism_name'],
                         ['Chilo iridescent virus'])
        self.assertEqual(seq_record.annotations['organism'],
                         'Invertebrate iridescent virus 6 (IIV-6)')
        self.assertEqual(seq_record.annotations['recommendedName_fullName'],
                         ['Uncharacterized protein 043L'])
        self.assertEqual(seq_record.annotations['sequence_version'], 1)
        self.assertEqual(seq_record.annotations['proteinExistence'],
                         ['Predicted'])
Example #41
    def test_uni003(self):
        "Parsing Uniprot file uni003"
        filename = 'uni003'
        # test the record parser

        datafile = os.path.join('SwissProt', filename)

        test_handle = open(datafile)
        seq_record = SeqIO.read(test_handle, "uniprot-xml")
        test_handle.close()

        self.assertTrue(isinstance(seq_record, SeqRecord))

        # test general record entries
        self.assertEqual(seq_record.id, "O44185")
        self.assertEqual(seq_record.name, "FLP13_CAEEL")
        self.assertEqual(seq_record.description,
                         "FMRFamide-like neuropeptides 13")
        self.assertEqual(
            repr(seq_record.seq),
            "Seq('MMTSLLTISMFVVAIQAFDSSEIRMLDEQYDTKNPFFQFLENSKRSDRPTRAMD...GRK', ProteinAlphabet())"
        )

        self.assertEqual(len(seq_record.annotations['references']), 7)
        self.assertEqual(seq_record.annotations['references'][5].authors,
                         'Kim K., Li C.')
        self.assertEqual(
            seq_record.annotations['references'][5].title,
            'Expression and regulation of an FMRFamide-related '
            'neuropeptide gene family in Caenorhabditis elegans.')
        self.assertEqual(seq_record.annotations['references'][5].journal,
                         'J. Comp. Neurol. 475:540-550(2004)')
        self.assertEqual(
            seq_record.annotations['references'][5].comment,
            'journal article | 2004 | Scope: TISSUE SPECIFICITY, '
            'DEVELOPMENTAL STAGE | ')

        self.assertEqual(seq_record.annotations["accessions"], ['O44185'])
        self.assertEqual(seq_record.annotations["created"], "2004-05-10")
        self.assertEqual(seq_record.annotations["dataset"], "Swiss-Prot")
        self.assertEqual(seq_record.annotations["gene_name_ORF"], ['F33D4.3'])
        self.assertEqual(seq_record.annotations["gene_name_primary"], "flp-13")
        self.assertEqual(seq_record.annotations["keywords"], [
            'Amidation', 'Cleavage on pair of basic residues',
            'Complete proteome', 'Direct protein sequencing', 'Neuropeptide',
            'Reference proteome', 'Repeat', 'Secreted', 'Signal'
        ])
        self.assertEqual(seq_record.annotations["modified"], "2012-11-28")
        self.assertEqual(seq_record.annotations["organism"],
                         "Caenorhabditis elegans")
        self.assertEqual(seq_record.annotations["proteinExistence"],
                         ['evidence at protein level'])
        self.assertEqual(seq_record.annotations["recommendedName_fullName"],
                         ['FMRFamide-like neuropeptides 13'])
        self.assertEqual(seq_record.annotations["sequence_length"], 160)
        self.assertEqual(seq_record.annotations["sequence_checksum"],
                         "BE4C24E9B85FCD11")
        self.assertEqual(seq_record.annotations["sequence_mass"], 17736)
        self.assertEqual(seq_record.annotations["sequence_modified"],
                         "1998-06-01")
        self.assertEqual(seq_record.annotations["sequence_precursor"], "true")
        self.assertEqual(seq_record.annotations["sequence_version"], 1)
        self.assertEqual(seq_record.annotations["taxonomy"], [
            'Eukaryota', 'Metazoa', 'Ecdysozoa', 'Nematoda', 'Chromadorea',
            'Rhabditida', 'Rhabditoidea', 'Rhabditidae', 'Peloderinae',
            'Caenorhabditis'
        ])
        self.assertEqual(seq_record.annotations["type"],
                         ['ECO:0000006', 'ECO:0000001'])
        self.assertEqual(seq_record.annotations["version"], 74)

        # test comment entries
        self.assertEqual(seq_record.annotations["comment_allergen"],
                         ['Causes an allergic reaction in human.'])
        self.assertEqual(
            seq_record.annotations["comment_alternativeproducts_isoform"],
            ['Q8W1X2-1', 'Q8W1X2-2'])
        self.assertEqual(seq_record.annotations["comment_biotechnology"], [
            'Green fluorescent protein has been engineered to produce a '
            'vast number of variously colored mutants, fusion proteins, '
            'and biosensors. Fluorescent proteins and its mutated allelic '
            'forms, blue, cyan and yellow have become a useful and '
            'ubiquitous tool for making chimeric proteins, where they '
            'function as a fluorescent protein tag. Typically they '
            'tolerate N- and C-terminal fusion to a broad variety of '
            'proteins. They have been expressed in most known cell types '
            'and are used as a noninvasive fluorescent marker in living '
            'cells and organisms. They enable a wide range of applications '
            'where they have functioned as a cell lineage tracer, reporter '
            'of gene expression, or as a measure of protein-protein '
            'interactions.', 'Can also be used as a molecular thermometer, '
            'allowing accurate temperature measurements in fluids. The '
            'measurement process relies on the detection of the blinking '
            'of GFP using fluorescence correlation spectroscopy.'
        ])
        self.assertEqual(seq_record.annotations["comment_catalyticactivity"], [
            'ATP + acetyl-CoA + HCO(3)(-) = ADP + phosphate + malonyl-CoA.',
            'ATP + biotin-[carboxyl-carrier-protein] + CO(2) = ADP + '
            'phosphate + carboxy-biotin-[carboxyl-carrier-protein].'
        ])
        self.assertEqual(seq_record.annotations["comment_caution"], [
            'Could be the product of a pseudogene. The existence of a '
            'transcript at this locus is supported by only one sequence '
            'submission (PubMed:2174397).'
        ])
        self.assertEqual(seq_record.annotations["comment_cofactor"], [
            'Biotin (By similarity).', 'Binds 2 manganese ions per '
            'subunit (By similarity).'
        ])
        self.assertEqual(
            seq_record.annotations["comment_developmentalstage"], [
                'Expressed from the comma stage of embryogenesis, during all '
                'larval stages, and in low levels in adults.'
            ])
        self.assertEqual(seq_record.annotations["comment_disease"], [
            'Defects in MC2R are the cause of glucocorticoid deficiency '
            'type 1 (GCCD1) [MIM:202200]; also known as familial '
            'glucocorticoid deficiency type 1 (FGD1). GCCD1 is an '
            'autosomal recessive disorder due to congenital '
            'insensitivity or resistance to adrenocorticotropin (ACTH). '
            'It is characterized by progressive primary adrenal '
            'insufficiency, without mineralocorticoid deficiency.'
        ])
        self.assertEqual(
            seq_record.annotations["comment_disruptionphenotype"], [
                'Mice display impaired B-cell development which does not '
                'progress pass the progenitor stage.'
            ])
        self.assertEqual(seq_record.annotations["comment_domain"], [
            'Two regions, an N-terminal (aa 96-107) and a C-terminal '
            '(aa 274-311) are required for binding FGF2.'
        ])
        self.assertEqual(seq_record.annotations["comment_enzymeregulation"], [
            'By phosphorylation. The catalytic activity is inhibited by '
            'soraphen A, a polyketide isolated from the myxobacterium '
            'Sorangium cellulosum and a potent inhibitor of fungal growth.'
        ])
        self.assertEqual(seq_record.annotations["comment_function"], [
            'FMRFamides and FMRFamide-like peptides are neuropeptides. '
            'AADGAPLIRF-amide and APEASPFIRF-amide inhibit muscle tension '
            'in somatic muscle. APEASPFIRF-amide is a potent inhibitor of '
            'the activity of dissected pharyngeal myogenic muscle system.'
        ])
        self.assertEqual(seq_record.annotations["comment_induction"], [
            'Repressed in presence of fatty acids. Repressed 3-fold by '
            'lipid precursors, inositol and choline, and also controlled '
            'by regulatory factors INO2, INO4 and OPI1.'
        ])
        self.assertEqual(
            seq_record.annotations["comment_interaction_intactId"],
            ['EBI-356720', 'EBI-746969', 'EBI-720116'])
        self.assertEqual(seq_record.annotations["comment_massspectrometry"],
                         ['88..98:1032|MALDI', '100..110:1133.7|MALDI'])
        self.assertEqual(
            seq_record.annotations["comment_miscellaneous"],
            ['Present with 20200 molecules/cell in log phase SD medium.'])
        self.assertEqual(
            seq_record.annotations["comment_onlineinformation"],
            ['NIEHS-SNPs@http://egp.gs.washington.edu/data/api5/'])
        self.assertEqual(seq_record.annotations["comment_pathway"], [
            'Lipid metabolism; malonyl-CoA biosynthesis; malonyl-CoA '
            'from acetyl-CoA: step 1/1.'
        ])
        self.assertEqual(seq_record.annotations["comment_RNAediting"], [
            'Partially edited. RNA editing generates receptor isoforms '
            'that differ in their ability to interact with the '
            'phospholipase C signaling cascade in a transfected cell '
            'line, suggesting that this RNA processing event may '
            'contribute to the modulation of serotonergic '
            'neurotransmission in the central nervous system.'
        ])
        self.assertEqual(
            seq_record.annotations["comment_PTM"],
            ['Acetylation at Lys-251 impairs antiapoptotic function.'])
        self.assertEqual(seq_record.annotations["comment_pharmaceutical"], [
            'Could be used as a possible therapeutic agent for treating '
            'rheumatoid arthritis.'
        ])
        self.assertEqual(seq_record.annotations["comment_polymorphism"], [
            'Position 23 is polymorphic; the frequencies in unrelated '
            'Caucasians are 0.87 for Cys and 0.13 for Ser.'
        ])
        self.assertEqual(
            seq_record.annotations["comment_similarity"],
            ['Belongs to the FARP (FMRFamide related peptide) family.'])
        self.assertEqual(
            seq_record.annotations["comment_subcellularlocation_location"],
            ['Secreted'])
        self.assertEqual(seq_record.annotations["comment_subunit"],
                         ['Homodimer.'])
        self.assertEqual(seq_record.annotations["comment_tissuespecificity"], [
            'Each flp gene is expressed in a distinct set of neurons. '
            'Flp-13 is expressed in the ASE sensory neurons, the DD motor '
            'neurons, the 15, M3 and M5 cholinergic pharyngeal '
            'motoneurons, and the ASG, ASK and BAG neurons.'
        ])
        self.assertEqual(seq_record.annotations["comment_toxicdose"], [
            'LD(50) is 50 ug/kg in mouse by intracerebroventricular '
            'injection and 600 ng/g in Blatella germanica.'
        ])
Example #42
 def test_Q13639(self):
     """Compare SwissProt text and uniprot XML versions of Q13639."""
     old = SeqIO.read("SwissProt/Q13639.txt", "swiss")
     new = SeqIO.read("SwissProt/Q13639.xml", "uniprot-xml")
     self.compare_txt_xml(old, new)
def main():
    """The main function
  """
    try:
        parser = cmdline_parser()
        (opts, args) = parser.parse_args()
        print(opts, args)
        if len(args):
            parser.error("Unrecognized arguments found: %s." %
                         (' '.join(args)))
            sys.exit(1)

    except Exception:
        parser.print_help()
        sys.exit(0)
    Entrez.email = opts.Entrez_email
    threads = str(3)
    tab_fmt = "'6 qseqid sseqid qstart qend sstart send pident qcovs evalue bitscore stitle'"
    print(opts.f1, args)
    # Read the summary file:
    info_dict = {}
    with open(opts.csv_file, 'r') as csvfile:
        reader = csv.reader(csvfile)
        for xi in reader:
            print(xi)
            info_dict[xi[-2]] = xi
    print info_dict.keys()

    os.chdir(opts.fq_dir)
    ref = opts.ref_dir + "/" + info_dict[opts.f1.replace("-Qc.fastq",
                                                         "")][-1] + ".fa"
    if not os.path.exists(opts.out_dir): os.mkdir(opts.out_dir)
    res_dir = opts.out_dir + "/" + opts.f1.replace("-Qc.fastq", "")
    if not os.path.exists(res_dir): os.mkdir(res_dir)
    flag = 0
    for x in SeqIO.parse(open(ref, "r"), "fasta"):
        split_ref = res_dir + "/" + x.id.replace("_", "").replace(
            "|", "").replace("/", "") + ".ref"
        with open(split_ref, "w") as ff:
            ff.write(">" + x.id + "\r\n" + str(x.seq) + "\r\n")

        # Map reads to a reference genome...
        cons_ref = res_dir + "/" + opts.f1.replace(
            "-Qc.fastq",
            x.id.replace("_", "").replace("|", "").replace("/", "") + ".fa")
        cons_cmd = "bam2cons_iter-lenient.sh  -f " + opts.f1 + " -r " + split_ref + " -t " + threads + " --force -o " + cons_ref
        print cons_cmd
        if not os.path.isfile(cons_ref): os.system(cons_cmd)

        cons = SeqIO.parse(cons_ref, "fasta")
        nee = [xl for xl in cons]
        if len(nee) != 0:
            X = []
            for xl in nee:
                nnn = str(xl.seq).replace("N", "")
                if len(nnn) > 30: X.append(nnn)
        else:
            X = []
        print "NEE", nee, X
        if len(nee) != 0 and len(X) != 0:
            # Blast the consensus formed....
            b_out = res_dir + "/" + opts.f1.replace(".fastq", ".tmp")
            blast_run_uni = NcbiblastnCommandline(cmd=opts.blast_n,
                                                  task="megablast",
                                                  db=opts.db,
                                                  max_target_seqs=1,
                                                  query=cons_ref,
                                                  outfmt=tab_fmt,
                                                  out=b_out)
            print "Runnin blast...", blast_run_uni
            nearest_ref1 = cons_ref.replace(".fa", "-nref1.fa")
            if not os.path.isfile(nearest_ref1):
                stdout, stderr = blast_run_uni()
                c = 0
                for line in open(b_out, "r"):
                    c = c + 1
                    if c == 1:
                        u_id = line.split("|")
                        custom = line.split("\t")
                print "u_id", u_id, custom
                if opts.database_type == "Custom":
                    cmd_cus = "blastdbcmd -entry '" + custom[
                        1] + "' -db " + opts.db + " > " + nearest_ref1
                    print cmd_cus
                    os.system(cmd_cus)
                elif opts.database_type == "NCBI":
                    handle = Entrez.efetch(
                        db="nuccore",
                        id=u_id[3],
                        rettype="gb",
                        retmode="text",
                        idtype="acc")  # 		  Use the PrimaryID instead of GI
                    #handle = Entrez.efetch(db="nucleotide", id="AY851612", rettype="gb", retmode="xml")
                    #handle = "elink -db nuccore -query " + u_id[3] + " |efetch -format gb "
                    #print handle
                    record = SeqIO.read(handle, 'genbank')
                    #record = Entrez.read(handle) #, validate=False)
                    #print record
                    with open(nearest_ref1, "w") as f:
                        #f.write(">" + record[0]["GBSeq_primary-accession"] + "\r\n" + record[0]["GBSeq_sequence"] + "\r\n")
                        f.write(">" + record.id + "\r\n" + str(record.seq) +
                                "\r\n")

        # 2-iteration Map reads to nearest reference...
        # IonXpress_012-Qc.fastq
            ddd = opts.f1.replace("-Qc.fastq", "")
            cons_ref2 = res_dir + "/" + info_dict[ddd][
                0] + "-" + x.id + "cons2.fa"
            cons_cmd2 = "bam2cons_iter-lenient.sh -f " + opts.f1 + " -r " + nearest_ref1 + " -t " + threads + " --force -o " + cons_ref2
            print cons_cmd2
            if not os.path.isfile(cons_ref2): os.system(cons_cmd2)

            # combine all segments into one ref...
            flag = 1
            Cons_ref_ful = res_dir + "/" + info_dict[ddd][0] + "-genome.fa"
            for i in SeqIO.parse(open(cons_ref2, "r"), "fasta"):
                with open(Cons_ref_ful, "a") as f:
                    print i
                    f.write(">" + info_dict[ddd][1].replace(" ", "") + "|" +
                            i.id.split("-")[1].replace("cons2", "") + "|" +
                            "\r\n" + str(i.seq) + "\r\n")
        else:
            print "No Coverage :( "

    # Map reads to consensus genome.... Bowtie2
    if flag == 1:
        cons_ref2_indx = "bowtie2-build " + Cons_ref_ful + " tmp_idx"
        os.system(cons_ref2_indx)
        out_sam = Cons_ref_ful.replace(".fa", ".sam")
        bowtie2_cmd = "bowtie2 --local --fast-local  -x  tmp_idx  -U " + opts.f1 + " -S  " + out_sam + " -p 8"
        os.system(bowtie2_cmd)

        # run lofreq2....
        bam_cmd = "samtools view -b -S " + out_sam + " > " + out_sam.replace(
            ".sam", ".bam")
        bam_sort = "samtools sort " + out_sam.replace(
            ".sam", ".bam") + " " + out_sam.replace(".sam", "-sort")
        cmd = "lofreq   call -C 100 -f  " + Cons_ref_ful + " -o " + out_sam.replace(
            ".sam", "-snps.vcf") + " " + out_sam.replace(".sam", "-sort.bam")
        # 100 is min depth required to call SNPs
        print bam_cmd
        os.system(bam_cmd)
        print bam_sort
        os.system(bam_sort)
        print cmd
        os.system(cmd)
        print "\n\n"
    else:
        print "Not this strain..."
Example #44
        for hit in self.rc_hits:
            # hit[0] is the x coordinate, hit[3] is the list of y coordinates
            x = [hit[0]] * len(hit[3])
            plt.scatter(x, hit[3])

        for aligned_hit in self.rc_chain:
            x = [aligned_hit[0]] * len(aligned_hit[3])
            plt.scatter(x, aligned_hit[3], edgecolors="black", linewidths=2)

        plt.show()


if __name__ == '__main__':
    # record1 = SeqIO.read("D:/Data/20170627/missing/missing_query_025.fasta", "fasta")
    # record2 = SeqIO.read("D:/Data/20170627/missing/missing_target_025.fasta", "fasta")
    # record1 = SeqIO.read("D:/Data/20170622/9mer_FP/FP_query_002.fasta", "fasta")
    # record2 = SeqIO.read("D:/Data/20170622/9mer_FP/FP_target_002.fasta", "fasta")
    record1 = SeqIO.read("D:/Data/20170706/FP_dustboth/FP_query_100.fasta", "fasta")
    record2 = SeqIO.read("D:/Data/20170706/FP_dustboth/FP_target_100.fasta", "fasta")
    test_filter = PseudoBloomFilter.PseudoBloomFilter(record2, 9, 54)
    print test_filter.L
    test_filter.generate_filter()
    test_query = QuerySeq(record1)
    test_query.check_kmer(test_filter)
    # print(test_query.fw_hits)
    # print(test_query.rc_hits)
    test_query.cluster_hits(size_threshold=3, debug=True, group_hit=1.0)
    print test_query.chain_align
    print test_query.aligned
    test_query.plot()
Example #45
def parse_fasta():
    # read and parse the FASTA file
    return SeqIO.read(args.fasta, 'fasta')
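One caveat about this pattern: SeqIO.read() raises ValueError unless the file contains exactly one record. A minimal sketch of a multi-record-tolerant variant (the helper name parse_fasta_any is hypothetical, not from the snippet above):

from Bio import SeqIO

def parse_fasta_any(path):
    # SeqIO.read() insists on exactly one record; fall back to
    # SeqIO.parse() so multi-FASTA input does not raise ValueError.
    records = list(SeqIO.parse(path, 'fasta'))
    if len(records) == 1:
        return records[0]
    return records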
Example #46
def main():
    
    start_time = time.time()

    args = parse_args()

    unique_results_files = list(OrderedDict.fromkeys(args.tables))
    list_of_isolates = []

    # key1 = (start, end), key2 = isolate, value = +/*/?
    #list_of_positions = collections.defaultdict(dict)
    list_of_positions = []
    # key1 = (start, end), key2 = ref, value = +
    #list_of_ref_positions = collections.defaultdict(dict)
    # key = (start, end), value = orientation (F/R)
    #position_orientation = {}

    reference_fasta = args.reference_gbk.split('.g')[0]
    # Create a fasta file of the reference for BLAST
    print 'Creating fasta file and database of reference ...'
    gbk_to_fasta(args.reference_gbk, reference_fasta)
    # Make a BLAST database
    blast_db(reference_fasta)
    # Get the reference positions and orientations for this IS query
    print '\nGetting query positions in reference ...'
    list_of_positions, ref_name = get_ref_positions(reference_fasta, args.seq, list_of_positions)

    elapsed_time = time.time() - start_time
    print 'Time taken: ' + str(elapsed_time)
    #print list_of_positions
    #print ref_name
    # Loop through each table given to --tables
    print 'Collating results files ...'
    for result_file in unique_results_files:
        # Get isolate name
        isolate = result_file.split('_table.txt')[0]
        list_of_isolates.append(isolate)
        # Skip the header
        header = 0
        with open(result_file) as file_open:
            for line in file_open:
                # Skip header
                if header == 0:
                    header += 1
                # Check to make sure there were actually hits
                elif 'No hits found' not in line and line != '':
                    info = line.strip('\n').split('\t')
                    # Get orientation for hit and start/end coordinates
                    orientation = info[1]
                    is_start = min(int(info[2]), int(info[3]))
                    is_end = max(int(info[3]), int(info[2]))
                    # Note whether call is Known, Novel or Possible related IS
                    call = info[5]
                    # See if this position is already in the list of positions
                    match = False
                    isolate_dict = {}
                    for pos in list_of_positions:
                        if pos.x == is_start and pos.y == is_end and pos.orientation == orientation:
                            # Then this position already exists
                            match = True
                            # And retrieve the existing position that it matches exactly
                            matching_pos = pos
                            # Then we want to add the info about this new position to the list
                            if '?' in call:
                                matching_pos.isolate_dict[isolate] = '?'
                            elif '*' in call:
                                matching_pos.isolate_dict[isolate] = '*'
                            else:
                                matching_pos.isolate_dict[isolate] = '+'
                    
                    # So we haven't seen this position before
                    if not match:
                        # The position list is empty, so there's nothing to check against, so just add
                        # this new position
                        if not list_of_positions:
                            if '?' in call:
                                isolate_dict[isolate] = '?'
                            elif '*' in call:
                                isolate_dict[isolate] = '*'
                            else:
                                isolate_dict[isolate] = '+'
                            new_pos = Position(is_start, is_end, orientation, isolate_dict, call, None, None)
                            list_of_positions.append(new_pos)

                        # If the list of positions isn't empty, then there are ranges to check against
                        else:
                            if args.tolerance == -1:
                                old_position, new_range = check_ranges(list_of_positions, (is_start, is_end), args.gap, orientation, call, isolate)
                            else:
                                old_position, new_range = check_ranges_tol(list_of_positions, (is_start, is_end), args.tolerance, orientation, call, isolate)
                            # So the current range overlaps with a range we already have
                            if old_position is not False:
                                isolate_dict = old_position.isolate_dict
                                # Add the new isolate to this dictionary
                                # Mark as ? if uncertain, * if imprecise
                                # or + if confident
                                if '?' in call:
                                    isolate_dict[isolate] = '?'
                                elif '*' in call:
                                    isolate_dict[isolate] = '*'
                                else:
                                    isolate_dict[isolate] = '+'
                                # Remove the old position from the list
                                list_of_positions.remove(old_position)
                                # Create the new position and add it
                                new_pos = Position(new_range[0], new_range[1], orientation, isolate_dict, call, None, None,
                                                   old_position.xs, old_position.ys, is_start, is_end)
                                list_of_positions.append(new_pos)
                            # Otherwise this range hasn't been seen before, so all values are False
                            else:
                                if '?' in call:
                                    isolate_dict[isolate] = '?'
                                elif '*' in call:
                                    isolate_dict[isolate] = '*'
                                else:
                                    isolate_dict[isolate] = '+'
                                new_pos = Position(is_start, is_end, orientation, isolate_dict, call, None, None)
                                list_of_positions.append(new_pos)

    elapsed_time = time.time() - start_time
    print 'Time taken: ' + str(elapsed_time)

    list_of_positions = [p for p in list_of_positions if len(p.isolate_dict) > args.drop]
    print 'Positions: ' + str(len(list_of_positions))

    # Check that all hits in every position are within tolerance
    num_bad_positions = 0
    if args.tolerance > 0 or args.gap > 0:
        max_delta = 2*(args.tolerance if args.tolerance > 0 else args.gap)
        for p in list_of_positions:
            bad = False
            if max(p.xs) - min(p.xs) > max_delta:
                print "Inconsistency in x positions at ", (p.x, p.y), " len ", len(p.xs), " delta ", max(p.xs) - min(p.xs)
                bad = True
            if max(p.ys) - min(p.ys) > max_delta:
                print "Inconsistency in y positions at ", (p.x, p.y), " len ", len(p.ys), " delta ", max(p.ys) - min(p.ys)
                bad = True
            num_bad_positions += bad

        print "Total bad positions ", num_bad_positions

    # Get the flanking genes for each position now they've all been merged
    print 'Getting flanking genes for each position (this step is the longest and could take some time) ...'
    # key = (start, end), value = [left_gene, right_gene]
    position_genes = {}

    # Get feature list
    gb = SeqIO.read(args.reference_gbk, "genbank")
    feature_list = []
    feature_count = 0
    feature_types = ["CDS", "tRNA", "rRNA"]

    for feature in gb.features:
        if feature.type in feature_types:
            feature_list.append([int(feature.location.start), int(feature.location.end), feature_count])
        feature_count += 1
    # Sort the list just in case it's out of order (has caused issues in the past!!)
    feature_list = sorted(feature_list, key=itemgetter(0))
    # Get flanking genes
    for pos in list_of_positions:
        genes_before, genes_after = get_flanking_genes(gb.features, feature_list, pos.x, pos.y, args.cds, args.trna, args.rrna, len(gb.seq))
        pos.left_feature = genes_before
        pos.right_feature = genes_after


    elapsed_time = time.time() - start_time
    print 'Time taken: ' + str(elapsed_time)

    # Order positions from smallest to largest for final table output
    list_of_positions.sort(key=lambda x: x.x)
    
    # Write out table
    print 'Writing output table to ' + args.output + ' ...'
    with open(args.output, 'w') as out:
        header = ['isolate']
        for pos in list_of_positions:
            if pos.orientation == 'F':
                header.append(str(pos.x) + '-' + str(pos.y))
            else:
                header.append(str(pos.y) + '-' + str(pos.x))
        out.write('\t'.join(header) + '\n')
        # Add the values for the reference positions
        row = [ref_name]
        for pos in list_of_positions:
            if ref_name in pos.isolate_dict.keys():
                row.append(pos.isolate_dict[ref_name])
            else:
                row.append('-')
        out.write('\t'.join(row) + '\n')
        
        # Loop through each isolate
        # and create each row
        for isolate in list_of_isolates:
            row = [isolate]
            for pos in list_of_positions:
                if isolate in pos.isolate_dict.keys():
                    row.append(pos.isolate_dict[isolate])
                else:
                    row.append('-')
            out.write('\t'.join(row) + '\n')
        # Set up flanking genes
        row_orientation = ['orientation']
        row_l_locus = ['left ID']
        row_r_locus = ['right ID']
        row_l_dist = ['left distance']
        row_r_dist = ['right distance']
        row_l_strand = ['left strand']
        row_r_strand = ['right strand']
        row_l_prod = ['left info']
        row_r_prod = ['right info']

        # Print orientation and flanking genes for each position
        for pos in list_of_positions:
            row_orientation.append(pos.orientation)
            row_l_locus.append(pos.left_feature[0])
            row_r_locus.append(pos.right_feature[0])
            row_l_dist.append(pos.left_feature[1])
            row_r_dist.append(pos.right_feature[1])
            row_l_strand.append(pos.left_feature[2][-1])
            row_r_strand.append(pos.right_feature[2][-1])
            row_l_prod.append(pos.left_feature[2])
            row_r_prod.append(pos.right_feature[2])
        out.write('\t'.join(row_orientation) + '\n')
        out.write('\t'.join(row_l_locus) + '\n')
        out.write('\t'.join(row_l_dist) + '\n')
        out.write('\t'.join(row_l_strand) + '\n')
        out.write('\t'.join(str(i) for i in row_l_prod) + '\n')
        out.write('\t'.join(row_r_locus) + '\n')
        out.write('\t'.join(row_r_dist) + '\n')
        out.write('\t'.join(row_r_strand) + '\n')
        out.write('\t'.join(str(i) for i in row_r_prod) + '\n')

    elapsed_time = time.time() - start_time
    print 'Table compilation finished in ' + str(elapsed_time)
Example #47
def main():
    sim = pt.Model(cell_volume=CELL_VOLUME)

    # Download T7 wild-type genbank records
    Entrez.email = "*****@*****.**"
    handle = Entrez.efetch(db="nuccore",
                           id=["NC_001604"],
                           rettype="gb",
                           retmode="text")

    record = SeqIO.read(handle, "genbank")
    genome_length = len(record.seq)
    phage = pt.Genome(name="phage", length=genome_length,
                      transcript_degradation_rate=1e-3)

    #phage = pt.Genome(name="phage", length=genome_length)

    # Initialise the weights once, outside the loop, so CDS weights
    # accumulate across features instead of being reset each iteration
    weights = [0.0] * len(record.seq)
    for feature in record.features:
        # Convert to inclusive genomic coordinates
        start = feature.location.start.position + 1
        stop = feature.location.end.position
        name = ''
        if "note" in feature.qualifiers:
            name = feature.qualifiers["note"][0]
        # Grab promoters and terminators
        if feature.type == "regulatory":
            if name in IGNORE_REGULATORY:
                continue
            # Construct promoter
            if "promoter" in feature.qualifiers["regulatory_class"]:
                length = stop - start
                if length < 35:
                    start = start - 35
                interactions = get_promoter_interactions(name)
                phage.add_promoter(name, start, stop, interactions)
            # Construct terminator params
            if "terminator" in feature.qualifiers["regulatory_class"]:
                interactions = get_terminator_interactions(name)
                phage.add_terminator(name, start, stop, interactions)
        # Grab genes/CDSes
        if feature.type == "gene":
            if name in IGNORE_GENES:
                continue
            if name in RELABEL_GENES:
                name = RELABEL_GENES[name]
            # Construct CDS parameters for this gene
            phage.add_gene(name=name, start=start, stop=stop,
                           rbs_start=start - 30, rbs_stop=start, rbs_strength=1e7)
        if feature.type == "CDS":
            weights = compute_cds_weights(record, feature, 1.0, weights)
        if feature.type == "misc_structure":
            print(feature.qualifiers)
            phage.add_rnase_site(start=start, stop=start + 10)
        print(start, stop, name)

    mask_interactions = ["rnapol-1", "rnapol-3.5",
                         "ecolipol", "ecolipol-p", "ecolipol-2", "ecolipol-2-p"]
    phage.add_mask(500, mask_interactions)

    norm_weights = normalize_weights(weights)
    phage.add_weights(norm_weights)

    sim.register_genome(phage)

    sim.add_polymerase("rnapol-1", 35, 230, 0)
    sim.add_polymerase("rnapol-3.5", 35, 230, 0)
    sim.add_polymerase("ecolipol", 35, 45, 0)
    sim.add_polymerase("ecolipol-p", 35, 45, 0)
    sim.add_polymerase("ecolipol-2", 35, 45, 0)
    sim.add_polymerase("ecolipol-2-p", 35, 45, 0)

    sim.add_polymerase("ribosome", 30, 30, 0)

    sim.add_species("bound_ribosome", 10000)

    sim.add_species("bound_ecolipol", 1800)
    sim.add_species("bound_ecolipol_p", 0)
    sim.add_species("ecoli_genome", 0)
    sim.add_species("ecoli_transcript", 0)

    sim.add_reaction(1e6, ["ecoli_transcript", "ribosome"], ["bound_ribosome"])

    sim.add_reaction(0.04, ["bound_ribosome"], [
                     "ribosome", "ecoli_transcript"])

    sim.add_reaction(0.001925, ["ecoli_transcript"], ["degraded_transcript"])

    sim.add_reaction(1e7, ["ecolipol", "ecoli_genome"], ["bound_ecolipol"])

    sim.add_reaction(
        0.3e7, ["ecolipol-p", "ecoli_genome"], ["bound_ecolipol_p"])

    sim.add_reaction(0.04, ["bound_ecolipol"], [
                     "ecolipol", "ecoli_genome", "ecoli_transcript"])

    sim.add_reaction(0.04, ["bound_ecolipol_p"], [
                     "ecolipol-p", "ecoli_genome", "ecoli_transcript"])

    sim.add_reaction(3.8e7, ["protein_kinase-0.7", "ecolipol"],
                     ["ecolipol-p", "protein_kinase-0.7"])

    sim.add_reaction(3.8e7, ["protein_kinase-0.7", "ecolipol-2"],
                     ["ecolipol-2-p", "protein_kinase-0.7"])

    sim.add_reaction(3.8e7, ["gp-2", "ecolipol"], ["ecolipol-2"])

    sim.add_reaction(3.8e7, ["gp-2", "ecolipol-p"], ["ecolipol-2-p"])

    sim.add_reaction(1.1, ["ecolipol-2-p"], ["gp-2", "ecolipol-p"])

    sim.add_reaction(1.1, ["ecolipol-2"], ["gp-2", "ecolipol"])

    sim.add_reaction(3.8e9, ["lysozyme-3.5", "rnapol-1"], ["rnapol-3.5"])

    sim.add_reaction(3.5, ["rnapol-3.5"], ["lysozyme-3.5", "rnapol-1"])

    sim.seed(72)

    # sim.run(stop_time=1500, time_step=5, output_prefix="test")

    sim.run(stop_time=1500, time_step=5, output_prefix="degrade_test3")
#! /usr/bin/env python
# gb2tbl.py
#This script converts a genbank flat file to a features table suitable for use with Sequin.
#Usage gb2tbl.py <genbank flatfile name>
#Writes to standard output so redirect to a file if desired
#Aaron M. Duffy  aduffy70{at}gmail.com
#May 2010

from Bio import SeqIO  # tools for parsing genbank files
from sys import argv  # a list of command line arguments
import re  # tools for working with regular expressions

#Read the genbank flat file
gbFile = open(argv[1], 'r')
gbRecord = SeqIO.read(gbFile, 'genbank')

#Print the header row
print ">Feature gb|%s|" % gbRecord.name

#Setup a pattern match to filter out "Geneious name:" lines
pattern = re.compile('Geneious name')

#Format and print each feature except the first one (it is summary data for the whole sequence)
for feature in gbRecord.features[1:]:
    if (len(feature.sub_features) > 0):  # Handle features that have subfeatures
        firstSubFeature = True
        orderedSubfeatures = feature.sub_features
        for subfeature in orderedSubfeatures:
            if (subfeature.strand == -1):  # reverse strand
                start = subfeature.location.nofuzzy_end
                stop = subfeature.location.nofuzzy_start + 1  # adjust for the python 0-index
Example #49
def countRNA2Structures(seq):
    if seq not in cache:
        # step k by 2 so seq[0] pairs with an odd index, leaving even-length substrings on both sides
        tmp = []
        for k in range(1, len(seq), 2):
            ''' Multiply the count for the inner substring seq[1:k] by the
            pairing value of seq[0] with seq[k] (0 or 1, from the seeded
            cache) and by the count for the remaining suffix seq[k+1:].
            This multiplication combines the numbers of noncrossing
            perfect matchings from the subproblems.

            The actual values/counts come from the dynamically grown cache.
            '''
            tmp.append(countRNA2Structures(seq[1:k]) * cache[seq[0]+seq[k]] * countRNA2Structures(seq[k+1:]))
        # assign current sequence into dictionary for later use
        cache[seq] = sum(tmp)
    return cache[seq]


if __name__ == "__main__":
    from Bio import SeqIO
    f = open("/Rosalind/data/rosalind_cat.txt", 'r')
    raw = SeqIO.read(f, "fasta")
    f.close()

    rna = str(raw.seq)

    # set up initial dictionary for number of matches for the sequence
    cache = {'':1, 'A':0, 'C':0, 'G':0, 'U':0, 'AA':0, 'AC':0, 'AG':0, 'AU':1, 'CA':0, 'CC':0,
             'CG':1, 'CU':0, 'GA':0, 'GC':1, 'GG':0, 'GU':0, 'UA':1, 'UC':0, 'UG':0, 'UU':0}

    print countRNA2Structures(rna) % 10**6
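As a quick sanity check of the recursion above (a standalone sketch, not part of the original script; it assumes the function and the seeded cache dictionary are in scope), small cases can be verified by hand:

# "AU" has exactly one noncrossing perfect matching (A-U), "AC" has
# none, and "AUAU" has two: A1-U2 / A3-U4 and A1-U4 / U2-A3.
assert countRNA2Structures("AU") == 1
assert countRNA2Structures("AC") == 0
assert countRNA2Structures("AUAU") == 2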
Example #50
print('Resetting ClusterCAD database.')
for cluster in pks.models.Cluster.objects.all():
    cluster.delete()
print('ClusterCAD database reset.')

# Assumes that chemical structures have already been aggregated
allknowncompounds = pickle.load(
    open('./data/compounds/all_known_products.p', 'rb'))

#for accession in ['BGC0000031']: # Debug with Borreledin
for accession in mibigaccessions:
    # Use accession number to get paths to MIBiG and antiSMASH files
    mibigfile = os.path.join(mibigpath, accession + '.json')
    clusterfile = os.path.join(antismashpath, accession + '.embl')

    # Read antiSMASH annotations for cluster
    record = SeqIO.read(clusterfile, "embl")
    description = record.description.replace(' biosynthetic gene cluster', '')

    # Get compound information
    try:
        compound = allknowncompounds[accession]
    # If compound is missing, we skip the cluster
    except KeyError:
        print('Missing compound %s: %s.' % (accession, description))
        continue
    knownproductsmiles = compound[0][0]
    knownproductsource = compound[1]

    # Enter information in ClusterCAD database
    try:
        cluster = pks.models.Cluster(
    sys.exit(0)

table_f, need_table, out_f = sys.argv[1:4]

fh_in = open(table_f, 'r')
fh_out = open(out_f, 'w')
for gi_l in fh_in:
    gi_l = gi_l.rstrip()
    gi, table = gi_l.split("\t")
    if table != need_table:
        continue

    Entrez.email = "*****@*****.**"
    handle = Entrez.efetch(db="nucleotide",
                           rettype="gb",
                           retmote="text",
                           id=gi)
    seq_record = SeqIO.read(handle, "gb")
    handle.close()

    line = ""
    line = seq_record.annotations['taxonomy']
    line = "\t".join(line)
    print(gi, line, sep="\t", file=fh_out)
    # do not post requests more than 3 times per second,
    # or your IP will be blocked by NCBI!!
    time.sleep(0.5)

fh_in.close()
fh_out.close()
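Fetching one record per GI in a loop like this is slow and easy to rate-limit. A gentler variant, sketched below with a hypothetical helper name, posts all IDs once with Entrez.epost and pulls the records back through the NCBI history server:

from Bio import Entrez, SeqIO

def fetch_taxonomies(gi_list, email="*****@*****.**"):
    # Post every ID in one request, then fetch them all via
    # WebEnv/QueryKey instead of one efetch call per GI.
    Entrez.email = email
    posted = Entrez.read(Entrez.epost("nucleotide", id=",".join(gi_list)))
    handle = Entrez.efetch(db="nucleotide", rettype="gb", retmode="text",
                           webenv=posted["WebEnv"],
                           query_key=posted["QueryKey"])
    taxonomies = {}
    for record in SeqIO.parse(handle, "gb"):
        taxonomies[record.id] = "\t".join(record.annotations["taxonomy"])
    handle.close()
    return taxonomies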
def process_fast5(
    oper,
    db,
    connection_pool,
    args,
    ref_fasta_hash,
    dbcheckhash,
    filepath,
    hdf,
    dbname,
    cursor,
):

    try:
        checksum = hashlib.md5(open(filepath, 'rb').read()).hexdigest()
    except Exception:
        err_string = "process_fast5(): error checksum %s" % filepath
        print >> sys.stderr, err_string
        sys.exit()

    # print checksum, type(checksum)
    # ## find the right basecall_2D location, get general configuration data, and define the basename.
    """basecalltype = 'Basecall_1D_CDNA'
    basecalltype2 = 'Basecall_2D'
    basecalldir = ''
    basecalldirconfig = ''

    # print "REF", ref_fasta_hash

    for x in range(0, 9):
        string = '/Analyses/%s_00%s/Configuration/general' \
            % (basecalltype, x)
        if string in hdf:
            basecalldir = '/Analyses/%s_00%s/' % (basecalltype, x)
            basecalldirconfig = string
            break
        string = '/Analyses/%s_00%s/Configuration/general' \
            % (basecalltype2, x)
        if string in hdf:
            basecalldir = '/Analyses/%s_00%s/' % (basecalltype2, x)
            basecalldirconfig = string
            break
    """

    file_type = check_read_type(filepath, hdf)
    #print "FILETYPE is", file_type

    if file_type == 2:
        basecalltype = "Basecall_1D"  #ML
        basecalltype2 = "Basecall_2D"
        basecalldir = ''
        basecalldirconfig = ''
        basecallindexpos = ''  #ML
        string2 = ''  #ML
        for x in range(0, 9):
            string2 = '/Analyses/Hairpin_Split_00%s/Configuration/general' % (
                x)  #ML
            if (string2 in hdf):
                basecallindexpos = x  #ml
                #print "BASECALLINDEXPOS",basecallindexpos
                basecalldirconfig = string2  #ML

        string = '/Analyses/%s_00%s/Configuration/general' % (basecalltype,
                                                              basecallindexpos)
        #print string
        if (string in hdf):
            #    print "YES 1"
            basecalldir = '/Analyses/%s_00%s/' % (basecalltype,
                                                  basecallindexpos)
            #basecallindexpos=x #ml
            #break

        string = '/Analyses/%s_00%s/Configuration/general' % (basecalltype2,
                                                              basecallindexpos)
        #print string
        if (string in hdf):
            #print "YES 2"
            basecalldir = '/Analyses/%s_00%s/' % (basecalltype2,
                                                  basecallindexpos)
            #basecalldirconfig=string2 #ML
            #break
    if file_type == 1:
        basecalltype = 'Basecall_1D_CDNA'
        basecalltype2 = 'Basecall_2D'
        basecalldir = ''
        basecalldirconfig = ''
        basecallindexpos = ''
        for x in range(0, 9):
            string = '/Analyses/%s_00%s/Configuration/general' \
                % (basecalltype, x)
            if string in hdf:
                basecalldir = '/Analyses/%s_00%s/' % (basecalltype, x)
                basecalldirconfig = string
                basecallindexpos = x
                break
            string = '/Analyses/%s_00%s/Configuration/general' \
                % (basecalltype2, x)
            if string in hdf:
                basecalldir = '/Analyses/%s_00%s/' % (basecalltype2, x)
                basecalldirconfig = string
                basecallindexpos = x
                break

    configdata = hdf[basecalldirconfig]
    basename = configdata.attrs[
        'basename']  # = PLSP57501_17062014lambda_3216_1_ch101_file10_strand

    # # get all the tracking_id data, make primary entry for basename, and get basenameid

    tracking_id_fields = [
        'basename',
        'asic_id',
        'asic_id_17',
        'asic_id_eeprom',
        'asic_temp',
        'device_id',
        'exp_script_purpose',
        'exp_script_name',
        'exp_start_time',
        'flow_cell_id',
        'heatsink_temp',
        'hostname',
        'run_id',
        'version_name',
    ]
    tracking_id_hash = make_hdf5_object_attr_hash(
        args, hdf['/UniqueGlobalKey/tracking_id'], tracking_id_fields)
    tracking_id_hash.update({
        'basename': basename,
        'file_path': filepath,
        'md5sum': checksum
    })
    hdf5object = hdf['/UniqueGlobalKey/channel_id']

    # print "Got event location"

    for x in ('channel_number', 'digitisation', 'offset', 'sampling_rate'):
        if x in hdf5object.attrs.keys():
            value = str(hdf5object.attrs[x])

            # print x, value

            tracking_id_hash.update({x: value})

    # range is a special case:
    # for x in ('range'):
    #    if (x in hdf5object.attrs.keys() ):
    #        value=str(hdf5object.attrs[x])
    #        print x, value
    #        tracking_id_hash.update({'range_val ':value})

    passcheck = 0
    if '/pass/' in filepath:
        passcheck = 1
    if '\\pass\\' in filepath:
        passcheck = 1
    tracking_id_hash.update({'pass': passcheck})
    basenameid = mysql_load_from_hashes(db, cursor, 'tracking_id',
                                        tracking_id_hash)

    # # get all the data from Configuration/general, then add Event Detection mux pore number

    general_fields = [
        'basename',
        'local_folder',
        'workflow_script',
        'workflow_name',
        'read_id',
        'use_local',
        'tag',
        'model_path',
        'complement_model',
        'max_events',
        'input',
        'min_events',
        'config',
        'template_model',
        'channel',
        'metrichor_version',
        'metrichor_time_stamp',
    ]
    general_hash = make_hdf5_object_attr_hash(args, configdata, general_fields)
    general_hash.update({'basename_id': basenameid})
    if (len(basecalldir) > 0):  #ML
        metrichor_info = hdf[basecalldir]  #ML
        try:
            general_hash.update({
                'metrichor_version':
                metrichor_info.attrs['chimaera version'],
                'metrichor_time_stamp':
                metrichor_info.attrs['time_stamp']
            })  #ML
        except:
            general_hash.update({
                'metrichor_version':
                metrichor_info.attrs['version'],
                'metrichor_time_stamp':
                metrichor_info.attrs['time_stamp']
            })  #ML
    else:  #ML
        general_hash.update({
            'metrichor_version': 'N/A',
            'metrichor_time_stamp': ''
        })  #ML

    # # get event detection for the read; define mux pore number

    eventdectionreadstring = \
        '/Analyses/EventDetection_000/Reads/Read_%s' \
        % general_hash['read_id']
    if eventdectionreadstring in hdf:
        hdf5object = hdf[eventdectionreadstring]

        # print "Got event location"

        for x in (
                'start_mux',
                'end_mux',
                'abasic_event_index',
                'abasic_found',
                'abasic_peak_height',
                'duration',
                'hairpin_event_index',
                'hairpin_found',
                'hairpin_peak_height',
                'hairpin_polyt_level',
                'median_before',
                'read_number',
                'scaling_used',
                'start_time',
        ):
            if x in hdf5object.attrs.keys():
                value = str(hdf5object.attrs[x])

                # print x, value

                general_hash.update({x: value})

        # Specific to catch read_id as a different class:

        for x in ('read_id',):
            if x in hdf5object.attrs.keys():
                value = str(hdf5object.attrs[x])

                # print 'read_name', value

                general_hash.update({'read_name': value})

        # Add pass flag to general_hash

        general_hash.update({'pass': passcheck})
        general_hash.update(
            {'exp_start_time': tracking_id_hash['exp_start_time']})
        # Time-window indexes derived from start_time and sampling_rate
        general_hash.update({
            '1minwin':
            int(hdf5object.attrs['start_time'] /
                float(tracking_id_hash['sampling_rate']) / 60)
        })
        general_hash.update({
            '5minwin':
            int(hdf5object.attrs['start_time'] /
                float(tracking_id_hash['sampling_rate']) / 60 / 5)
        })
        general_hash.update({
            '10minwin':
            int(hdf5object.attrs['start_time'] /
                float(tracking_id_hash['sampling_rate']) / 60 / 10)
        })
        general_hash.update({
            '15minwin':
            int(hdf5object.attrs['start_time'] /
                float(tracking_id_hash['sampling_rate']) / 60 / 15)
        })

        # if ('start_mux' in hdf5object.attrs.keys() ):
        #    start_mux=str(hdf5object.attrs['start_mux'])
        # print "start_mux", start_mux
        #    general_hash.update({'start_mux':start_mux})
        # if ('end_mux' in hdf5object.attrs.keys() ):
        #    stop_mux=str(hdf5object.attrs['end_mux'])
        # print "stop_mux", stop_mux
        #    general_hash.update({'end_mux':stop_mux})

    # ## load general_hash into mysql

    mysql_load_from_hashes(db, cursor, 'config_general', general_hash)

    # # get all the basecall summary split hairpin data

    basecall_summary_fields = [
        'abasic_dur',
        'abasic_index',
        'abasic_peak',
        'duration_comp',
        'duration_temp',
        'end_index_comp',
        'end_index_temp',
        'hairpin_abasics',
        'hairpin_dur',
        'hairpin_events',
        'hairpin_peak',
        'median_level_comp',
        'median_level_temp',
        'median_sd_comp',
        'median_sd_temp',
        'num_comp',
        'num_events',
        'num_temp',
        'pt_level',
        'range_comp',
        'range_temp',
        'split_index',
        'start_index_comp',
        'start_index_temp',
    ]
    if file_type == 1:
        basecall_summary_hash = make_hdf5_object_attr_hash(
            args, hdf[basecalldir + 'Summary/split_hairpin'],
            basecall_summary_fields)
    if file_type == 2:
        basecall_summary_hash = make_hdf5_object_attr_hash(
            args, hdf['/Analyses/Hairpin_Split_00' + str(basecallindexpos) +
                      '/Summary/split_hairpin'], basecall_summary_fields)
    #print '/Analyses/Hairpin_Split_00'+str(basecallindexpos)+'/Summary/split_hairpin'
    #print basecall_summary_hash
    # # adding info about the basecalling itself

    if basecalldir + 'Summary/basecall_1d_complement' in hdf:
        hdf5object = hdf[basecalldir + 'Summary/basecall_1d_complement']

        # print "Got event location"

        for x in (
                'drift',
                'mean_qscore',
                'num_skips',
                'num_stays',
                'scale',
                'scale_sd',
                'sequence_length',
                'shift',
                'strand_score',
                'var',
                'var_sd',
        ):
            if x in hdf5object.attrs.keys():
                value = str(hdf5object.attrs[x])

                # print x, value

                basecall_summary_hash.update({x + 'C': value})

    # # adding info about the basecalling itself

    if basecalldir + 'Summary/basecall_1d_template' in hdf:
        hdf5object = hdf[basecalldir + 'Summary/basecall_1d_template']

        # print "Got event location"

        for x in (
                'drift',
                'mean_qscore',
                'num_skips',
                'num_stays',
                'scale',
                'scale_sd',
                'sequence_length',
                'shift',
                'strand_score',
                'var',
                'var_sd',
        ):
            if x in hdf5object.attrs.keys():
                value = str(hdf5object.attrs[x])

                # print x, value

                basecall_summary_hash.update({x + 'T': value})

    if basecalldir + 'Summary/basecall_2d' in hdf:
        hdf5object = hdf[basecalldir + 'Summary/basecall_2d']

        # print "Got event location"

        for x in ('mean_qscore', 'sequence_length'):
            if x in hdf5object.attrs.keys():
                value = str(hdf5object.attrs[x])

                # print x, value

                basecall_summary_hash.update({x + '2': value})

    # # Adding key indexes and time stamps

    basecall_summary_hash.update({'basename_id': basenameid})
    basecall_summary_hash.update({'pass': passcheck})
    basecall_summary_hash.update(
        {'exp_start_time': tracking_id_hash['exp_start_time']})
    basecall_summary_hash.update({'1minwin': general_hash['1minwin']})
    basecall_summary_hash.update({'5minwin': general_hash['5minwin']})
    basecall_summary_hash.update({'10minwin': general_hash['10minwin']})
    basecall_summary_hash.update({'15minwin': general_hash['15minwin']})

    # print basecall_summary_hash

    # # load basecall summary hash into mysql

    mysql_load_from_hashes(db, cursor, 'basecall_summary',
                           basecall_summary_hash)

    # # see if there is any barcoding info to add

    barcode_hash = dict()
    for x in range(0, 9):
        string = '/Analyses/Barcoding_00%s/Summary/barcoding' % x

        # print string

        if string in hdf:

            # print "barcode", string

            barcode_hash = make_hdf5_object_attr_hash(args, hdf[string], (
                'pos0_start',
                'score',
                'design',
                'pos1_end',
                'pos0_end',
                'pos1_start',
                'variant',
                'barcode_arrangement',
            ))
            barcode_hash.update({'basename_id': basenameid})
            mysql_load_from_hashes(db, cursor, 'barcode_assignment',
                                   barcode_hash)

            # print barcode_hash
            # for bk in barcode_hash.keys():
            #    print bk, barcode_hash[bk], type(barcode_hash[bk])

            break

    # ------------ Do model details -------------------

    if args.telem is True:
        if dbname not in dbcheckhash['modelcheck']:
            dbcheckhash['modelcheck'][dbname] = dict()

        log_string = basecalldir + 'Log'
        if log_string in hdf:
            log_data = str(hdf[log_string][()])

            # print type(log), log

            lines = log_data.split('\n')
            template_model = None
            complement_model = None
            for l in lines:
                t = re.match('.*Selected model: "(.*template.*)".', l)
                if t:
                    template_model = t.group(1)
                c = re.match('.*Selected model: "(.*complement.*)".', l)
                if c:
                    complement_model = c.group(1)

            if template_model is not None:
                sql = \
                    "INSERT INTO %s (basename_id,template_model,complement_model) VALUES ('%s','%s',NULL)" \
                    % ('model_list', basenameid, template_model)
                if template_model not in dbcheckhash['modelcheck'][dbname]:
                    location = basecalldir + 'BaseCalled_template/Model'
                    if location in hdf:
                        upload_model_data('model_data', template_model,
                                          location, hdf, cursor, db)
                        dbcheckhash['modelcheck'][dbname][template_model] = 1

                if complement_model is not None:
                    sql = \
                        "INSERT INTO %s (basename_id,template_model,complement_model) VALUES ('%s','%s','%s')" \
                        % ('model_list', basenameid, template_model,
                           complement_model)
                    if complement_model not in dbcheckhash['modelcheck'][
                            dbname]:
                        location = basecalldir \
                            + 'BaseCalled_complement/Model'
                        if location in hdf:
                            upload_model_data('model_data', complement_model,
                                              location, hdf, cursor, db)
                            dbcheckhash['modelcheck'][dbname][
                                complement_model] = 1

                cursor.execute(sql)
                db.commit()

    # ---------------------------------------------------------------------------
    if file_type == 1:
        readtypes = {'basecalled_template': basecalldir \
                    + 'BaseCalled_template/',
                    'basecalled_complement': basecalldir \
                    + 'BaseCalled_complement/',
                    'basecalled_2d': basecalldir + 'BaseCalled_2D/'}
    if file_type == 2:
        readtypes = {
            'basecalled_template':
            '/Analyses/Basecall_1D_00' + str(basecallindexpos) + "/" +
            'BaseCalled_template/',
            'basecalled_complement':
            '/Analyses/Basecall_1D_00' + str(basecallindexpos) + "/" +
            'BaseCalled_complement/',
            'basecalled_2d':
            '/Analyses/Basecall_2D_00' + str(basecallindexpos) + "/" +
            'BaseCalled_2D/'
        }  #ML

    fastqhash = dict()

    # tel_sql_list=list()

    tel_data_hash = dict()
    template_start = 0
    for (readtype, location) in readtypes.iteritems():
        if location in hdf:
            fastq = hdf[location + 'Fastq'][()]
            try:
                rec = SeqIO.read(StringIO(fastq), 'fastq')
            except Exception, err:
                err_string = \
                    '%s:\tError reading fastq object from base: %s type: %s error: %s' \
                    % (time.strftime('%Y-%m-%d %H:%M:%S'), basename,
                       readtype, err)
                print >> sys.stderr, err_string
                with open(dbcheckhash['logfile'][dbname], 'a') as \
                    logfilehandle:
                    logfilehandle.write(err_string + os.linesep)
                continue

            sequence = str(rec.seq)
            seqlen = len(sequence)
            rec.id = basename + '.' + readtype

            qual = chr_convert_array(db,
                                     rec.letter_annotations['phred_quality'])
            fastqhash[rec.id] = \
                {'quals': rec.letter_annotations['phred_quality'],
                 'seq': sequence}

            if location + 'Alignment' in hdf:  # so it's 2D

                # print "we're looking at a 2D read",template_start,"\n\n"

                mysql_load_from_hashes(
                    db, cursor, readtype, {
                        'basename_id': basenameid,
                        'seqid': rec.id,
                        'sequence': sequence,
                        'qual': qual,
                        'start_time': template_start,
                        'seqlen': seqlen,
                        'exp_start_time': tracking_id_hash['exp_start_time'],
                        '1minwin': int(template_start / 60),
                        '5minwin': int(template_start / (5 * 60)),
                        '10minwin': int(template_start / (10 * 60)),
                        '15minwin': int(template_start / (15 * 60)),
                        'pass': passcheck,
                    })
                if args.telem is True:
                    alignment = hdf[location + 'Alignment'][()]

                    # print "ALIGNMENT", type(alignment)

                    channel = general_hash['channel'][-1]
                    tel_data_hash[readtype] = [basenameid, channel, alignment]

                    # upload_2dalignment_data(basenameid,channel,alignment,db)
                    # tel_sql_list.append(t_sql)

            complement_and_template_fields = [
                'basename',
                'seqid',
                'duration',
                'start_time',
                'scale',
                'shift',
                'gross_shift',
                'drift',
                'scale_sd',
                'var_sd',
                'var',
                'sequence',
                'qual',
            ]
            if location + 'Events' in hdf and location + 'Model' in hdf:  # so it's either template or complement
                events_hash = make_hdf5_object_attr_hash(
                    args, hdf[location + 'Events'],
                    complement_and_template_fields)
                model_hash = make_hdf5_object_attr_hash(
                    args, hdf[location + 'Model'],
                    complement_and_template_fields)

                # #Logging the start time of a template read to pass to the 2d read in order to speed up mysql processing

                if readtype == 'basecalled_template':
                    template_start = events_hash['start_time']
                events_hash.update(model_hash)
                events_hash.update({
                    'basename_id':
                    basenameid,
                    'seqid':
                    rec.id,
                    'sequence':
                    sequence,
                    'qual':
                    qual,
                    'seqlen':
                    seqlen,
                    '1minwin':
                    int(events_hash['start_time'] / 60),
                    '5minwin':
                    int(events_hash['start_time'] / (5 * 60)),
                    '10minwin':
                    int(events_hash['start_time'] / (10 * 60)),
                    '15minwin':
                    int(events_hash['start_time'] / (15 * 60)),
                })
                events_hash.update({
                    'exp_start_time':
                    tracking_id_hash['exp_start_time'],
                    'pass':
                    passcheck
                })
                mysql_load_from_hashes(db, cursor, readtype, events_hash)

                # -------- This inserts telemetry data. It is optional under the flags above.
                # -------- Modified to calculate some means and averages,
                # -------- so we are going to do this every time.
                # if (args.telem is True):
                # print "start telem",  (time.time())-starttime
                # ## Do Events

                events = hdf[location + 'Events'][()]
                tablechannel = readtype + '_' + general_hash['channel'][-1]
                tel_data_hash[readtype] = [basenameid, tablechannel, events]
    print(line)
    try:
        handle = Entrez.efetch(db="nucleotide",
                               id=str(line),
                               rettype="gb",
                               retmode="text")
    except urllib.error.HTTPError as exception:
        print('error with entrez connection, trying again')
        time.sleep(2)
        handle = Entrez.efetch(db="nucleotide",
                               id=str(line),
                               rettype="gb",
                               retmode="text")

    x = SeqIO.read(
        handle, 'genbank'
    )  # the fetched record for this accession number; its annotations include the taxonomy
    tax = x.annotations['taxonomy']  # only get the taxonomy
    taxf = ";".join(tax)  # join the taxonomy levels with the ';' character
    full_lineage = (
        taxf + ';' + x.annotations['organism']
    )  # also append the organism-specific name, since I want that too
    line = line.strip()
    lineage_info[line] = full_lineage

print("You have " + str(len(lineage_info)) + ' accesion numbers')
"""now open the fasta file containing the headers you want to change. This is going to be done on my viral db which 
was downloaded from ftp://ftp.ncbi.nlm.nih.gov/refseq/release/viral/ . This downloaded dataset contains fasta files and 
header is AccesionNumber and Organism name. I want to change it to AccesionNumber and Taxonomic lineage. You can remove
the accesion number and add to a new file using the following commands:
sed 's/\s.*$//' viral_all.fna | grep ">" | sed 's/>//' > Viral_accesion_numbers.txt
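A minimal sketch of the header rewrite described above (assuming lineage_info maps accession numbers to full lineages as built earlier; the output file name is a placeholder):

from Bio import SeqIO

with open("viral_all_relineage.fna", "w") as out_handle:
    for record in SeqIO.parse("viral_all.fna", "fasta"):
        # Swap the organism-name description for the taxonomic lineage,
        # keeping the accession number as the record id.
        if record.id in lineage_info:
            record.description = lineage_info[record.id]
        SeqIO.write(record, out_handle, "fasta")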
Example #54
import sys

from Bio import SeqIO

fw=open(sys.argv[2],"w")
fw.write('Gene_ID\t'+'A(bases)\t'+'C(bases)\t'+'G(bases)\t'+'T(bases)'+'\t'+'N(bases)'+'\t'+'N_percentage(%)'+'\n')
biggestN={}
recordall={}
with open(sys.argv[1]) as IN:
        for seq_record in SeqIO.parse(IN, "fasta"):
                #print('>'+seq_record.id+' '+str(len(seq_record)))
                #print(str(seq_record.seq))
                #SeqIO.write(seq_record, fw, "fasta")
                Tempcount=[]
                #print(seq_record.seq.upper().count('A'))
                for nt in ('A','T','C','G','N'):
                        Tempcount.append(seq_record.seq.upper().count(nt))
                totallength=len(seq_record.seq)
                N_percentage=round((float(Tempcount[-1])/totallength)*100.0,2)
                Tempcount.append(N_percentage)
                line='\t'.join(map(str, Tempcount))
                result_count=seq_record.id+'\t'+line
                biggestN[seq_record.id]=Tempcount[-2]
                recordall[seq_record.id]=result_count
        for key,value in sorted(biggestN.items(),key=lambda x:x[1],reverse=True):
                if key in recordall.keys():
                        print >>fw,"{}".format(recordall[key])
fw.close()
'''
record = SeqIO.read(sys.argv[1], "fasta")
print('>'+record.id)
print(record.seq)
'''
Example #55
aln_fname = '../data/' + flutype + '_HA1_all_years_filtered.fasta.gz'

if flutype.startswith('H3N2'):
    cds = {'begin': 0, 'end': 987, 'pad': 0}
else:
    cds = {'begin': 0, 'end': 300, 'pad': 0}

if os.path.isfile('../data/' + flutype + '_L_L_predictions.pickle'):
    with open('../data/' + flutype + '_L_L_predictions.pickle') as infile:
        laessig_prediction = pickle.load(infile)

# open annotations file
with open('../data/' + flutype + '_annotations.pickle', 'r') as infile:
    annotation = pickle.load(infile)
outgroup = SeqIO.read('../data/' + flutype + '_outgroup.fasta', 'fasta')

bin_dt = 105  # time bins in days; 3*105 = 315 days, approximately 10 months
years = range(1995, 2012)
predictions = {}
for year in years:
    if "oceania" in test_regions:
        prediction_set = {
            'start': date(year - 2, 10, 1),
            'stop': date(year - 1, 9, 30),
            'regions': prediction_regions,
            'sample_size': sample_size
        }
        test_set = {
            'start': date(year, 3, 1),
            'stop': date(year, 9, 30),
	def __init__(self,min_length = 900, **kwargs):
		'''
		parameters
		min_length  -- minimal length for a sequence to be acceptable
		'''
		flu_filter.__init__(self, **kwargs)
		self.min_length = min_length
		self.vaccine_strains =[
				{
					"strain": "A/Wisconsin/67/2005",
					"db": "IRD",
					"accession": "CY163984",
					"date": "2005-08-31",
					"region": "north_america",
					"country": "usa",
					"seq": "ATGAAGACTATCATTGCTTTGAGCTACATTCTATGTCTGGTTTTCGCTCAAAAACTTCCCGGAAATGACAACAGCACGGCAACGCTGTGCCTTGGGCACCATGCAGTACCAAACGGAACGATAGTGAAAACAATCACGAATGACCAAATTGAAGTTACTAATGCTACTGAGCTGGTTCAGAGTTCCTCAACAGGTGGAATATGCGACAGTCCTCATCAGATCCTTGATGGAGAAAACTGCACACTAATAGATGCTCTATTGGGAGACCCTCAGTGTGATGGCTTCCAAAATAAGAAATGGGACCTTTTTGTTGAACGCAGCAAAGCCTACAGCAACTGTTACCCTTATGATGTGCCGGATTATGCCTCCCTTAGGTCACTAGTTGCCTCATCCGGCACACTGGAGTTTAACGATGAAAGCTTCAATTGGACTGGAGTCACTCAAAATGGAACAAGCTCTTCTTGCAAAAGGAGATCTAATAACAGTTTCTTTAGTAGATTGAATTGGTTGACCCACTTAAAATTCAAATACCCAGCATTGAACGTGACTATGCCAAACAATGAAAAATTTGACAAATTGTACATTTGGGGGGTTCACCACCCGGTTACGGACAATGACCAAATCTTCCTGTATGCTCAAGCATCAGGAAGAATCACAGTCTCTACCAAAAGAAGCCAACAAACTGTAATCCCGAATATCGGATCTAGACCCAGAATAAGGAATATCCCCAGCAGAATAAGCATCTATTGGACAATAGTAAAACCGGGAGACATACTTTTGATTAACAGCACAGGGAATCTAATTGCTCCTAGGGGTTACTTCAAAATACGAAGTGGGAAAAGCTCAATAATGAGATCAGATGCACCCATTGGCAAATGCAATTCTGAATGCATCACTCCAAATGGAAGCATTCCCAATGACAAACCATTTCAAAATGTAAACAGGATCACATATGGGGCCTGTCCCAGATATGTTAAGCAAAACACTCTGAAATTGGCAACAGGGATGCGAAATGTACCAGAGAAACAAACTAGAGGCATATTTGGCGCAATCGCGGGTTTCATAGAAAATGGTTGGGAGGGAATGGTGGATGGTTGGTACGGTTTCAGGCATCAAAATTCTGAGGGAATAGGACAAGCAGCAGATCTCAAAAGCACTCAAGCAGCAATCAATCAAATCAATGGGAAGCTGAATAGGTTGATCGGGAAAACCAACGAGAAATTCCATCAGATTGAAAAAGAATTCTCAGAAGTAGAAGGGAGAATTCAGGACCTCGAGAAATATGTTGAGGACACTAAAATAGATCTCTGGTCATACAACGCGGAGCTTCTTGTTGCCCTGGAGAACCAACATACAATTGATCTAACTGACTCAGAAATGAACAAACTGTTTGAAAGAACAAAGAAGCAACTGAGGGAAAATGCTGAGGATATGGGCAATGGTTGTTTCAAAATATACCACAAATGTGACAATGCCTGCATAGGATCAATCAGAAATGGAACTTATGACCATGATGTATACAGAGATGAAGCATTAAACAACCGGTTCCAGATCAAAGGCGTTGAGCTGAAGTCAGGATACAAAGATTGGATCCTATGGATTTCCTTTGCCATATCATGTTTTTTGCTTTGTGTTGCTTTGTTGGGGTTCATCATGTGGGCCTGCCAAAAAGGCAACATTAGGTGCAACATTTGCATTTGA"
				},	{
					"strain": "A/Brisbane/10/2007",
					"db": "IRD",
					"accession": "CY113005",
					"date": "2007-02-06",
					"region": "oceania",
					"country": "australia",
					"seq": "ATGAAGACTATCATTGCTTTGAGCTACATTCTATGTCTGGTTTTCACTCAAAAACTTCCCGGAAATGACAACAGCACGGCAACGCTGTGCCTTGGGCACCATGCAGTACCAAACGGAACGATAGTGAAAACAATCACGAATGACCAAATTGAAGTTACTAATGCTACTGAGCTGGTTCAGAGTTCCTCAACAGGTGAAATATGCGACAGTCCTCATCAGATCCTTGATGGAGAAAACTGCACACTAATAGATGCTCTATTGGGAGACCCTCAGTGTGATGGCTTCCAAAATAAGAAATGGGACCTTTTTGTTGAACGCAGCAAAGCCTACAGCAACTGTTACCCTTATGATGTGCCGGATTATGCCTCCCTTAGGTCACTAGTTGCCTCATCCGGCACACTGGAGTTTAACAATGAAAGCTTCAATTGGACTGGAGTCACTCAAAACGGAACAAGCTCTGCTTGCATAAGGAGATCTAATAACAGTTTCTTTAGTAGATTGAATTGGTTGACCCACTTAAAATTCAAATACCCAGCATTGAACGTGACTATGCCAAACAATGAAAAATTTGACAAATTGTACATTTGGGGGGTTCACCACCCGGGTACGGACAATGACCAAATCTTCCCGTATGCTCAAGCATCAGGAAGAATCACAGTCTCTACCAAAAGAAGCCAACAAACTGTAATCCCGAATATCGGATCTAGACCCAGAGTAAGGAATATCCCCAGCAGAATAAGCATCTATTGGACAATAGTAAAACCGGGAGACATACTTTTGATTAACAGCACAGGGAATCTAATTGCTCCTAGGGGTTACTTCAAAATACGAAGTGGGAAAAGCTCAATAATGAGATCAGATGCACCCATTGGCAAATGCAATTCTGAATGCATCACTCCAAACGGAAGCATTCCCAATGACAAACCATTCCAAAATGTAAACAGGATCACATACGGGGCCTGTCCCAGATATGTTAAGCAAAACACTCTGAAATTGGCAACAGGGATGCGAAATGTACCAGAGAAACAAACTAGAGGCATATTTGGCGCAATCGCGGGTTTCATAGAAAATGGTTGGGAGGGAATGGTGGATGGTTGGTACGGTTTCAGGCATCAAAATTCTGAGGGAATAGGACAAGCAGCAGATCTCAAAAGCACTCAAGCAGCAATCGATCAAATCAATGGGAAGCTGAATAGGTTGATCGGGAAAACCAACGAGAAATTCCATCAGATTGAAAAAGAATTCTCAGAAGTCGAAGGGAGAATTCAGGACCTTGAGAAATATGTTGAGGACACCAAAATAGATCTCTGGTCATACAACGCGGAGCTTCTTGTTGCCCTGGAGAACCAACATACAATTGATCTAACTGACTCAGAAATGAACAAACTGTTTGAAAAAACAAAGAAGCAACTGAGGGAAAATGCTGAGGATATGGGCAATGGTTGTTTCAAAATATACCACAAATGTGACAATGCCTGCATAGGATCAATCAGAAATGGAACTTATGACCACAATGTATACAGAGATGAAGCATTAAACAACCGGTTCCAGATCAAGGGCGTTGAGCTGAAGTCAGGATACAAAGATTGGATCCTATGGATTTCCTTTGCCATATCATGTTTTTTGCTTTGTGTTGCTTTGTTGGGGTTCATCATGTGGGCCTGCCAAAAAGGCAACATTAGGTGCAACATTTGCATTTGA"
				},	{
					"strain": "A/Perth/16/2009",
					"db": "IRD",
					"accession": "GQ293081",
					"date": "2009-04-07",
					"region": "oceania",
					"country": "australia",
					"seq": "ATGAAGACTATCATTGCTTTGAGCTACATTCTATGTCTGGTTTTCGCTCAAAAACTTCCTGGAAATGACAACAGCACGGCAACGCTGTGCCTTGGGCACCATGCAGTACCAAACGGAACGATAGTGAAAACAATCACGAATGACCAAATTGAAGTTACTAATGCTACTGAGCTGGTTCAGAGTTCCTCAACAGGTGAAATATGCGACAGTCCTCATCAGATCCTTGATGGAAAAAACTGCACACTAATAGATGCTCTATTGGGAGACCCTCAGTGTGATGGCTTCCAAAATAAGAAATGGGACCTTTTTGTTGAACGCAGCAAAGCCTACAGCAACTGTTACCCTTATGATGTGCCGGATTATGCCTCCCTTAGGTCACTAGTTGCCTCATCCGGCACACTGGAGTTTAACAATGAAAGCTTCAATTGGACTGGAGTCACTCAAAACGGAACAAGCTCTGCTTGCATAAGGAGATCTAAAAACAGTTTCTTTAGTAGATTGAATTGGTTGACCCACTTAAACTTCAAATACCCAGCATTGAACGTGACTATGCCAAACAATGAACAATTTGACAAATTGTACATTTGGGGGGTTCACCACCCGGGTACGGACAAAGACCAAATCTTCCTGTATGCTCAAGCATCAGGAAGAATCACAGTCTCTACCAAAAGAAGCCAACAAACCGTAAGCCCGAATATCGGATCTAGACCCAGAGTAAGGAATATCCCTAGCAGAATAAGCATCTATTGGACAATAGTAAAACCGGGAGACATACTTTTGATTAACAGCACAGGGAATCTAATTGCTCCTAGGGGTTACTTCAAAATACGAAGTGGGAAAAGCTCAATAATGAGATCAGATGCACCCATTGGCAAATGCAATTCTGAATGCATCACTCCAAATGGAAGCATTCCCAATGACAAACCATTCCAAAATGTAAACAGGATCACATACGGGGCCTGTCCCAGATATGTTAAGCAAAACACTCTGAAATTGGCAACAGGGATGCGAAATGTACCAGAGAAACAAACTAGAGGCATATTTGGCGCAATCGCGGGTTTCATAGAAAATGGTTGGGAGGGAATGGTGGATGGTTGGTACGGTTTCAGGCATCAAAATTCTGAGGGAAGAGGACAAGCAGCAGATCTCAAAAGCACTCAAGCAGCAATCGATCAAATCAATGGGAAGCTGAATAGATTGATCGGGAAAACCAACGAGAAATTCCATCAGATTGAAAAAGAATTCTCAGAAGTCGAAGGGAGAATTCAGGACCTTGAGAAATATGTTGAGGACACTAAAATAGATCTCTGGTCATACAACGCGGAGCTTCTTGTTGCCCTGGAGAACCAACATACAATTGATCTAACTGACTCAGAAATGAACAAACTGTTTGAAAAAACAAAGAAGCAACTGAGGGAAAATGCTGAGGATATGGGCAATGGTTGTTTCAAAATATACCACAAATGTGACAATGCCTGCATAGGATCAATCAGAAATGGAACTTATGACCACGATGTATACAGAGATGAAGCATTAAACAACCGGTTTCAGATCAAGGGAGTTGAGCTGAAGTCAGGGTACAAAGATTGGATCCTATGGATTTCCTTTGCCATATCATGTTTTTTGCTTTGTGTTGCTTTGTTGGGGTTCATCATGTGGGCCTGCCAAAAAGGCAACATTAGGTGCAACATTTGCATTTGA"
				},	{
					"strain": "A/Victoria/361/2011",
					"db": "IRD",
					"accession": "GQ293081",
					"date": "2011-10-24",
					"region": "oceania",
					"country": "australia",					
					"seq": "ATGAAGACTATCATTGCTTTGAGCCACATTCTATGTCTGGTTTTCGCTCAAAAACTTCCTGGAAATGACAACAGCACGGCAACGCTGTGCCTTGGGCACCATGCAGTACCAAACGGAACGATAGTGAAAACAATCACGAATGACCAAATTGAAGTTACTAATGCTACTGAGCTGGTTCAGAATTCCTCAATAGGTGAAATATGCGACAGTCCTCATCAGATCCTTGATGGAGAAAACTGCACACTAATAGATGCTCTATTGGGAGACCCTCAGTGTGATGGCTTCCAAAATAAGAAATGGGACCTTTTTGTTGAACGAAGCAAAGCCTACAGCAACTGTTACCCTTATGATGTGCCGGATTATGCCTCCCTTAGGTCACTAGTTGCCTCATCCGGCACACTGGAGTTTAACAATGAAAGCTTCAATTGGACTGGAGTCACTCAAAACGGAACAAGTTCTGCTTGCATAAGGAGATCTAATAATAGTTTCTTTAGTAGATTAAATTGGTTGACCCGCTTAAACTTCAAATACCCAGCATTGAACGTGACTATGCCAAACAATGAACAATTTGACAAATTGTACATTTGGGGGGTTCACCACCCGGTTACGGACAAGGAACAAATCTTCCTGTATGCTCAATCATCAGGAAGAATCACAGTATCTACCAAAAGAAGCCAACAAGCTGTAATCCCGAATATCGGATATAGACCCAGAATAAGGAATATCCCTAGCAGAATAAGCATCTATTGGACAATAGTAAAACCGGGAGACATACTTTTGATTAACAGCACAGGGAATCTAATTGCTCCTAGGGGTTACTTCAAAATACGAAGTGGGAAAAGCTCAATAATGAGATCAGATGCACCCATTGGCAAATGCAATTCTGAATGCATCACTCCAAATGGAAGCATTCCCAATGACAAACCATTCCAAAATGTAAACAGGATCACATACGGGGCCTGTCCCAGATATGTTAAGCAAAGCACTCTGAAATTGGCAACAGGAATGCGAAATGTACCAGAGAAACAAACTAGAGGCATATTTGGCGCAATAGCGGGTTTCATAGAAAATGGTTGGGAGGGAATGGTGGATGGTTGGTACGGTTTCAGGCATCAAAATTCTGAGGGAAGAGGACAAGCAGCAGATCTCAAAAGCACTCAAGCAGCAATCGATCAAATCAATGGGAAGCTGAATCGATTGATCGGGAAAACCAACGAGAAATTCCATCAGATTGAAAAAGAATTCTCAGAAGTCGAAGGGAGAATTCAGGACCTTGAGAAATATGTTGAGGACACTAAAATAGATCTCTGGTCATACAACGCGGAGCTTCTTGTTGCCCTGGAGAACCAACATACAATTGATCTAACTGACTCAGAAATGAACAAACTGTTTGAAAAAACAAAGAAGCAACTAAGGGAAAATGCTGAGGATATGGGCAATGGTTGTTTCAAAATATACCACAAATGTGACAATGCCTGCATAGGATCAATCAGAAATGGAACTTATGACCACGATGTATACAGAGATGAAGCATTAAACAACCGGTTCCAGATCAAGGGAGTTGAGCTGAAGTCAGGGTACAAAGATTGGATCCTATGGATTTCCTTTGCCATATCATGTTTTTTGCTTTGTGTTGCTTTGTTGGGGTTCATCATGTGGGCCTGCCAAAAGGGCAACATTAGGTGCAACATTTGCATTTGA"
				},	{
					"strain": "A/Texas/50/2012",
					"db": "GISAID",
					"isolate_id": "EPI_ISL_129858",
					"date": "2012-04-15",
					"region": "north_america",
					"country": "usa",					
					"seq": "ATGAAGACTATCATTGCTTTGAGCTACATTCTATGTCTGGTTTTCGCTCAAAAACTTCCTGGAAATGACAATAGCACGGCAACGCTGTGCCTTGGGCACCATGCAGTACCAAACGGAACGATAGTGAAAACAATCACGAATGACCGAATTGAAGTTACTAATGCTACTGAACTGGTTCAGAATTCCTCAATAGGTGAAATATGCGACAGTCCTCATCAGATCCTTGATGGAGAAAACTGCACACTAATAGATGCTCTATTGGGAGACCCTCAGTGTGATGGCTTCCAAAATAAGAAATGGGACCTTTTTGTTGAACGAAGCAAAGCCTACAGCAACTGTTACCCTTATGATGTGCCGGATTATGCCTCCCTTAGGTCACTAGTTGCCTCATCCGGCACACTGGAGTTTAACAATGAAAGCTTCAATTGGAATGGAGTCACTCAAAACGGAACAAGTTCTGCTTGCATAAGGAGATCTAATAATAGTTTCTTTAGTAGATTAAATTGGTTGACCCACTTAAACTTCAAATACCCAGCATTGAACGTGACTATGCCAAACAATGAACAATTTGACAAATTGTACATTTGGGGGGTTCACCACCCGGGTACGGACAAGGACCAAATCTTCCTGTATGCTCAACCATCAGGAAGAATCACAGTATCTACCAAAAGAAGCCAACAAGCTGTAATCCCGAATATCGGATCTAGACCCAGAATAAGGAATATCCCTAGCAGAATAAGCATCTATTGGACAATAGTAAAACCGGGAGACATACTTTTGATTAACAGCACAGGGAATCTAATTGCTCCTAGGGGTTACTTCAAAATACGAAGTGGGAAAAGCTCAATAATGAGATCAGATGCACCCATTGGCAAATGCAAGTCTGAATGCATCACTCCAAATGGAAGCATTCCCAATGACAAACCATTCCAAAATGTAAACAGGATCACATACGGGGCCTGTCCCAGATATGTTAAGCAAAGCACTCTGAAATTGGCAACAGGAATGCGGAATGTACCAGAGAAACAAACTAGAGGCATATTTGGCGCAATAGCGGGTTTCATAGAAAATGGTTGGGAGGGAATGGTGGATGGTTGGTACGGTTTCAGGCATCAAAATTCTGAGGGAAGAGGACAAGCAGCAGATCTCAAAAGCACTCAAGCAGCAATCGATCAAATCAATGGGAAGCTGAATCGATTGATCGGGAAAACCAACGAGAAATTCCATCAGATTGAAAAAGAATTCTCAGAAGTAGAAGGGAGAATTCAGGACCTTGAGAAATATGTTGAGGACACTAAAATAGATCTCTGGTCATACAACGCGGAGCTTCTTGTTGCCCTGGAGAACCAACATACAATTGATCTAACTGACTCAGAAATGAACAAACTGTTTGAAAAAACAAAGAAGCAACTGAGGGAAAATGCTGAGGATATGGGCAATGGTTGTTTCAAAATATACCACAAATGTGACAATGCCTGCATAGGATCAATCAGAAATGGAACTTATGACCACGATGTATACAGAGATGAAGCATTAAACAACCGGTTCCAGATCAAGGGAGTTGAGCTGAAGTCAGGGTACAAAGATTGGATCCTATGGATTTCCTTTGCCATATCATGTTTTTTGCTTTGTGTTGCTTTGTTGGGGTTCATCATGTGGGCCTGCCAAAAGGGCAACATTAGGTGCAACATTTGCATTTGA",
				},	{
					"strain": "A/Switzerland/9715293/2013",
					"db": "GISAID",
					"isolate_id": "EPI_ISL_162149",
					"date": "2013-12-06",
					"region": "europe",
					"country": "switzerland",
					"seq": "ATGAAGACTATCATTGCTTTGAGCTACATTCTATGTCTGGTTTTCGCTCAAAAACTTCCTGGAAATGACAATAGCACGGCAACGCTGTGCCTTGGGCACCATGCAGTACCAAACGGAACGATAGTGAAAACAATCACGAATGACCGAATTGAAGTTACTAATGCTACTGAGCTGGTTCAGAATTCCTCAATAGGTGAAATATGCGACAGTCCTCATCAGATCCTTGATGGAGAAAACTGCACACTAATAGATGCTCTATTGGGAGACCCTCAGTGTGATGGCTTTCAAAATAAGAAATGGGACCTTTTTGTTGAACGAAGCAAAGCCTACAGCAACTGTTACCCTTATGATGTGCCGGATTATGCCTCCCTTAGGTCACTAGTTGCCTCATCCGGCACACTGGAGTTTAACAATGAAAGCTTCAATTGGGCTGGAGTCACTCAAAACGGAACAAGTTCTTCTTGCATAAGGGGATCTAATAGTAGTTTCTTTAGTAGATTAAATTGGTTGACCCACTTAAACTCCAAATACCCAGCATTAAACGTGACTATGCCAAACAATGAACAATTTGACAAATTGTACATTTGGGGGGTTCACCACCCGGGTACGGACAAGGACCAAATCTTCCTGTATGCACAATCATCAGGAAGAATCACAGTATCTACCAAAAGAAGCCAACAAGCTGTAATCCCGAATATCGGATCTAGACCCAGAATAAGGGATATCCCTAGCAGAATAAGCATCTATTGGACAATAGTAAAACCGGGAGACATACTTTTGATTAACAGCACAGGGAATCTAATTGCTCCTAGGGGTTACTTCAAAATACGAAGTGGGAAAAGCTCAATAATGAGATCAGATGCACCCATTGGCAAATGCAAGTCTGAATGCATCACTCCAAATGGAAGCATTCCCAATGACAAACCATTCCAAAATGTAAACAGGATCACATACGGGGCCTGTCCCAGATATGTTAAGCAAAGCACTCTGAAATTGGCAACAGGAATGCGAAATGTACCAGAGAGACAAACTAGAGGCATATTTGGCGCAATAGCGGGTTTCATAGAAAATGGTTGGGAGGGAATGGTGGATGGTTGGTACGGCTTCAGGCATCAAAATTCTGAGGGAAGAGGACAAGCAGCAGATCTCAAAAGCACTCAAGCAGCAATCGATCAAATCAATGGGAAGCTGAATCGATTGATCGGGAAAACCAACGAGAAATTCCATCAGATTGAAAAAGAATTCTCAGAAGTAGAAGGGAGAATTCAGGACCTTGAGAAATATGTTGAGGACACAAAAATAGATCTCTGGTCATACAACGCGGAGCTTCTTGTTGCCCTGGAGAACCAACATACAATTGATCTAACTGACTCAGAAATGAACAAACTGTTTGAAAAAACAAAGAAGCAACTGAGGGAAAATGCTGAGGATATGGGCAATGGTTGTTTCAAAATATACCACAAATGTGACAATGCCTGCATAGGATCAATCAGAAATGGAACTTATGACCACGATGTATACAGGGATGAAGCATTAAACAACCGGTTCCAGATCAAGGGAGTTGAGCTGAAGTCAGGGTACAAAGATTGGATCCTATGGATTTCCTTTGCCATATCATGTTTTTTGCTTTGTGTTGCTTTGTTGGGGTTCATCATGTGGGCCTGCCAAAAGGGCAACATTAGGTGCAACATTTGCATTTGA",
				},  {
					"strain": "A/HongKong/4801/2014",
					"db": "GISAID",
					"isolate_id": "EPI_ISL_165554",
					"date": "2014-02-26",
					"region": "china",
					"country": "hong_kong",
					"seq": "ATGAAGACTATCATTGCTTTGAGCTACATTCTATGTCTGGTTTTCGCTCAAAAAATTCCTGGAAATGACAATAGCACGGCAACGCTGTGCCTTGGGCACCATGCAGTACCAAACGGAACGATAGTGAAAACAATCACGAATGACCGAATTGAAGTTACTAATGCTACTGAGCTGGTTCAGAATTCCTCAATAGGTGAAATATGCGACAGTCCTCATCAGATCCTTGATGGAGAAAACTGCACACTAATAGATGCTCTATTGGGAGACCCTCAGTGTGATGGCTTTCAAAATAAGAAATGGGACCTTTTTGTTGAACGAAGCAAAGCCTACAGCAACTGTTACCCTTATGATGTGCCGGATTATGCCTCCCTTAGGTCACTAGTTGCCTCATCCGGCACACTGGAGTTTAACAATGAAAGCTTCAATTGGACTGGAGTCACTCAAAACGGAACAAGTTCTGCTTGCATAAGGAGATCTAGTAGTAGTTTCTTTAGTAGATTAAATTGGTTGACCCACTTAAACTACACATACCCAGCATTGAACGTGACTATGCCAAACAATGAACAATTTGACAAATTGTACATTTGGGGGGTTCACCACCCGGGTACGGACAAGGACCAAATCTTCCTGTATGCTCAATCATCAGGAAGAATCACAGTATCTACCAAAAGAAGCCAACAAGCTGTAATCCCAAATATCGGATCTAGACCCAGAATAAGGGATATCCCTAGCAGAATAAGCATCTATTGGACAATAGTAAAACCGGGAGACATACTTTTGATTAACAGCACAGGGAATCTAATTGCTCCTAGGGGTTACTTCAAAATACGAAGTGGGAAAAGCTCAATAATGAGATCAGATGCACCCATTGGCAAATGCAAGTCTGAATGCATCACTCCAAATGGAAGCATTCCCAATGACAAACCATTCCAAAATGTAAACAGGATCACATACGGGGCCTGTCCCAGATATGTTAAGCATAGCACTCTGAAATTGGCAACAGGAATGCGAAATGTACCAGAGAAACAAACTAGAGGCATATTTGGCGCAATAGCGGGTTTCATAGAAAATGGTTGGGAGGGAATGGTGGATGGTTGGTACGGTTTCAGGCATCAAAATTCTGAGGGAAGAGGACAAGCAGCAGATCTCAAAAGCACTCAAGCAGCAATCGATCAAATCAATGGGAAGCTGAATCGATTGATCGGGAAAACCAACGAGAAATTCCATCAGATTGAAAAAGAATTCTCAGAAGTAGAAGGAAGAATTCAGGACCTTGAGAAATATGTTGAGGACACTAAAATAGATCTCTGGTCATACAACGCGGAGCTTCTTGTTGCCCTGGAGAACCAACATACAATTGATCTAACTGACTCAGAAATGAACAAACTGTTTGAAAAAACAAAGAAGCAACTGAGGGAAAATGCTGAGGATATGGGCAATGGTTGTTTCAAAATATACCACAAATGTGACAATGCCTGCATAGGATCAATAAGAAATGGAACTTATGACCACAATGTGTACAGGGATGAAGCATTAAACAACCGGTTCCAGATCAAGGGAGTTGAGCTGAAGTCAGGGTACAAAGATTGGATCCTATGGATTTCCTTTGCCATATCATGTTTTTTGCTTTGTGTTGCTTTGTTGGGGTTCATCATGTGGGCCTGCCAAAAGGGCAACATTAGGTGCAACATTTGCATTTGA",
				},  {
					"strain": "A/Alaska/232/2015",
					"db": "GISAID",
					"isolate_id": "EPI787411",
					"date": "2015-09-09",
					"region": "north_america",
					"country": "usa",
					"seq": "GGATAATTCTATTAACCATGAAGACTATCATTGCTTTGAGCTACATTCTATGTCTGGTTTTCGCTCAAAAAATTCCTGGAAATGACAATAGCACGGCAACGCTGTGCCTTGGGCACCATGCAGTACCAAACGGAACGATAGTGAAAACAATCACAAATGACCGAATTGAAGTTACTAATGCTACTGAGTTGGTTCAGAATTCCTCAATAGGTGAAATATGCGACAGTCCTCATCAGATCCTTGATGGAGAGAACTGCACACTAATAGATGCTCTATTGGGAGACCCTCAGTGTGATGGCTTTCAAAATAAGAAATGGGACCTTTTTGTTGAACGAAGCAAAGCCTACAGCAACTGTTACCCTTATGATGTGCCGGATTATGCCTCCCTTAGGTCACTAGTTGCCTCATCCGGCACACTGGAGTTTAACAATGAAAGCTTCAATTGGACTGGAGTCACTCAAAACGGAACAAGTTCTTCTTGCATAAGGAGATCTAGTAGTAGTTTCTTTAGTAGATTAAATTGGTTGACCCACTTAAACTACAAATATCCAGCATTGAACGTGACTATGCCAAACAAGGAACAATTTGACAAATTGTACATTTGGGGGGTTCACCACCCGGGTACGGACAAGGACCAAATCTACCCGTATGCTCAATCATCAGGAAGAATCACAGTATCTACCAAAAGAAGCCAACAAGCTGTAATCCCAAATATCGGATCTAGACCCAGAATAAGGGATATCCCTAGCAGAATAAGCATCTATTGGACAATAGTAAAACCGGGAGACATACTTTTGATTAACAGCACAGGGAATCTAATTGCTCCTAGGGGTTACTTCAAAATACGAAGTGGGAAAAGCTCAATAATGAGATCAGATGCACCCATTGGCAAATGCAAGTCTGAATGCATCACTCCAAATGGAAGCATTCCCAATGACAAACCATTCCAAAATGTAAACAGGATCACATACGGGGCCTGTCCCAGATATGTTAAGCATAGCACTCTGAAATTGGCAACAGGAATGCGAAATGTACCAGAGAAACAAACTAGAGGCATATTTGGCGCAATAGCGGGTTTCATAGAAAATGGTTGGGAGGGAATGGTGGATGGTTGGTACGGTTTCAGGCATCAAAATTCTGAGGGAAGAGGACAAGCAGCAGATCTCAAAAGCACTCAAGCAGCAATCGATCAAATCAATGGGAAGCTGAATCGGTTGATCGGGAAAACCAACGAGAAATTCCATCAGATTGAAAAAGAATTCTCAGAAGTAGAAGGAAGAGTTCAAGACCTTGAGAAATATGTTGAGGACACTAAAATAGATCTCTGGTCATACAACGCGGAGCTTCTTGTTGCCCTGGAGAACCAACATACAATTGATCTAACTGACTCAGAAATGAACAAACTGTTTGAAAAAACAAAGAAGCAACTGAGGGAAAATGCTGAGGATATGGGAAATGGTTGTTTCAAAATATACCACAAATGTGACAATGCCTGCATAGGATCAATAAGAAATGAAACTTATGACCACAATGTGTACAGGGATGAAGCATTAAACAACCGGTTCCAGATCAAGGGAGTTGAGCTGAAGTCAGGGTACAAAGATTGGATCCTATGGATTTCCTTTGCCATATCATGTTTTTTGCTTTGTGTTGCTTTGTTGGGGTTCATCATGTGGGCCTGCCAAAAGGGCAACATTAGATGCAACATTTGCATTTGAGTGCATTAATTAAAAACAC"
				}
			]
		tmp_outgroup = SeqIO.read('source-data/H3N2_outgroup.gb', 'genbank')
		genome_annotation = tmp_outgroup.features
		self.cds = {x.qualifiers['gene'][0]:x for x in genome_annotation
				if 'gene' in x.qualifiers and x.type=='CDS' and
				x.qualifiers['gene'][0] in ['SigPep', 'HA1', 'HA2']}
		self.outgroup = {
			'strain': 'A/Beijing/32/1992',
			'db': 'IRD',
			'accession': 'U26830',
			'date': '1992-01-01',
			'country': 'china',
			'region': 'china',
			'seq': str(tmp_outgroup.seq).upper()
		}
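The setup above reduces to one reusable pattern: read a single GenBank record and index its CDS features by gene name. A minimal standalone sketch (same file path as above; any single-record GenBank file works):

from Bio import SeqIO

record = SeqIO.read('source-data/H3N2_outgroup.gb', 'genbank')
cds = {f.qualifiers['gene'][0]: f
       for f in record.features
       if f.type == 'CDS' and 'gene' in f.qualifiers}
print(sorted(cds))  # expected to include 'HA1', 'HA2', 'SigPep'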
Exemple #57
    def get_raw_check(self, filename, format, alphabet, comp):
        # Also checking the key_function here
        if comp:
            h = gzip.open(filename, "rb")
            raw_file = h.read()
            h.close()
            # gzip_open is a text-mode gzip helper defined alongside this
            # test in the original module.
            h = gzip_open(filename, format)
            id_list = [rec.id.lower() for rec in
                       SeqIO.parse(h, format, alphabet)]
            h.close()
        else:
            h = open(filename, "rb")
            raw_file = h.read()
            h.close()
            id_list = [rec.id.lower() for rec in
                       SeqIO.parse(filename, format, alphabet)]

        if format in ["sff"]:
            with warnings.catch_warnings():
                warnings.simplefilter('ignore', BiopythonParserWarning)
                rec_dict = SeqIO.index(filename, format, alphabet,
                                       key_function=lambda x: x.lower())
        else:
            rec_dict = SeqIO.index(filename, format, alphabet,
                                   key_function=lambda x: x.lower())

        self.assertEqual(set(id_list), set(rec_dict))
        self.assertEqual(len(id_list), len(rec_dict))
        for key in id_list:
            self.assertTrue(key in rec_dict)
            self.assertEqual(key, rec_dict[key].id.lower())
            self.assertEqual(key, rec_dict.get(key).id.lower())
            raw = rec_dict.get_raw(key)
            self.assertTrue(raw.strip())
            self.assertTrue(raw in raw_file)
            rec1 = rec_dict[key]
            # The following isn't very elegant, but it lets us check that
            # the SFF __getitem__ code is working.
            if format in SeqIO._BinaryFormats:
                handle = BytesIO(raw)
            else:
                handle = StringIO(_bytes_to_string(raw))
            if format == "sff":
                rec2 = SeqIO.SffIO._sff_read_seq_record(handle,
                            rec_dict._proxy._flows_per_read,
                            rec_dict._proxy._flow_chars,
                            rec_dict._proxy._key_sequence,
                            rec_dict._proxy._alphabet,
                            trim=False)
            elif format == "sff-trim":
                rec2 = SeqIO.SffIO._sff_read_seq_record(handle,
                            rec_dict._proxy._flows_per_read,
                            rec_dict._proxy._flow_chars,
                            rec_dict._proxy._key_sequence,
                            rec_dict._proxy._alphabet,
                            trim=True)
            elif format == "uniprot-xml":
                self.assertTrue(raw.startswith(_as_bytes("<entry ")))
                self.assertTrue(raw.endswith(_as_bytes("</entry>")))
                # Currently the __getitem__ method uses this trick too,
                # but we hope to fix that later.
                raw = """<?xml version='1.0' encoding='UTF-8'?>
                <uniprot xmlns="http://uniprot.org/uniprot"
                xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
                xsi:schemaLocation="http://uniprot.org/uniprot
                http://www.uniprot.org/support/docs/uniprot.xsd">
                %s
                </uniprot>
                """ % _bytes_to_string(raw)
                handle = StringIO(raw)
                rec2 = SeqIO.read(handle, format, alphabet)
            else:
                rec2 = SeqIO.read(handle, format, alphabet)
            self.assertEqual(True, compare_record(rec1, rec2))
        rec_dict.close()
        del rec_dict
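In short, the behaviour exercised above: SeqIO.index applies key_function to every record id at indexing time, so all lookups must use the transformed key. A minimal sketch ('example.fasta' is a placeholder path):

from Bio import SeqIO

rec_dict = SeqIO.index('example.fasta', 'fasta',
                       key_function=lambda x: x.lower())
for key in rec_dict:                      # keys are already lower-cased
    assert key == rec_dict[key].id.lower()
    assert rec_dict.get_raw(key).strip()  # raw record bytes from the file
rec_dict.close()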
Exemple #58
# Only doing a_vs_b here, could also have b_vs_c and c_vs_d etc.
# Assumes input_folder, file_a/file_b, format_a/format_b, file_a_vs_b and
# name are defined earlier in the original script, plus:
#   import os
#   from Bio import SeqIO
#   from Bio.Graphics.GenomeDiagram import Diagram
genomes = [
    (os.path.join(input_folder, file_a), format_a),
    (os.path.join(input_folder, file_b), format_b),
]
comparisons = [os.path.join(input_folder, file_a_vs_b)]

# Create diagram with tracks, each with a feature set
assert len(genomes) >= 2 and len(genomes) == len(comparisons) + 1
gd_diagram = Diagram(name, track_size=0.35, circular=False)
tracks = dict()
feature_sets = dict()
records = dict()
for f, format in genomes:
    records[f] = SeqIO.read(f, format)
    tracks[f] = gd_diagram.new_track(1,
                                     name=f,
                                     start=0,
                                     end=len(records[f]),
                                     scale_smalltick_interval=1000,
                                     scale_largetick_interval=10000,
                                     greytrack=True,
                                     greytrack_labels=0)
    feature_sets[f] = tracks[f].new_set()

print("Drawing matches...")
for i, crunch_file in enumerate(comparisons):
    q = genomes[i + 1][0]  # query file
    s = genomes[i][0]  # subject file
    q_set = feature_sets[q]
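    # (The original example is truncated here. What follows is a hedged
    # sketch, modelled on the Biopython tutorial's whole-genome comparison
    # figure, of how each match is typically drawn as a CrossLink between
    # the two tracks. The crunch-file column layout is an assumption, and
    # two extra imports are needed at the top of the script:
    #   from reportlab.lib import colors
    #   from Bio.Graphics.GenomeDiagram import CrossLink
    with open(crunch_file) as handle:
        for line in handle:
            if line.startswith("#"):
                continue
            fields = line.split()
            score = int(fields[0])                           # assumed 0-100
            s_start, s_end = int(fields[2]), int(fields[3])  # assumed subject
            q_start, q_end = int(fields[5]), int(fields[6])  # assumed query
            color = colors.linearlyInterpolatedColor(
                colors.white, colors.firebrick, 0, 100, min(score, 100))
            link = CrossLink((tracks[q], q_start, q_end),
                             (tracks[s], s_start, s_end), color)
            gd_diagram.cross_track_links.append(link)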
Exemple #59
# Assumes project-local helpers (load_pprs, load_plastids, get_closest_gene,
# HMMER, binding, sequence_similarity) plus:
#   import gc
#   from Bio import SeqIO
#   from Bio.SeqRecord import SeqRecord
#   from Bio.SeqFeature import FeatureLocation
def find_homologs():
    """Predict homologs of PPRs in other genomes based on footprints"""
    pprs = load_pprs()
    plastids = load_plastids(exclude=[
        "Arabidopsis thaliana",
    ])
    known_binding = SeqIO.read("output/ARA_annotated.gb", "gb")
    exact_features = [
        f for f in known_binding.features if "exact" in f.type.lower()
    ]
    ara_genes = [f for f in known_binding.features if f.type.lower() == "gene"]
    ara_genes.sort(key=lambda g: g.location.start)

    print("Loaded {} pprs and {} plastids".format(len(pprs), len(plastids)))

    for k, ppr in enumerate(pprs):
        print("Searching for homologs of '{}' ({}/{})".format(
            ppr.name, k + 1, len(pprs)))
        footprints = [
            f for f in exact_features
            if f.type.lower() == "{}_exact".format(ppr.name.lower())
        ]
        ppr.genes = [get_closest_gene(f, ara_genes) for f in footprints]

        print("\tFound {} original genes, {}".format(
            len(ppr.genes), [g.qualifiers['gene'] for g in ppr.genes]))

        ppr.potentialHomologs = {}

        for i, plastid in enumerate(plastids):

            # only this plastid is searched for now; all others are skipped
            if plastid.name != "Alsophila spinulosa":
                continue

            print("\t\tSearch {}/{}".format(i + 1, len(plastids)))

            # search for homologs of each gene
            homologs = []
            for gene in ppr.genes:
                g = SeqRecord(gene.extract(known_binding.seq).translate())
                search = HMMER.jackhmmer(g, plastid)
                print("{} -> {} homologs".format(
                    gene.qualifiers['gene'], len(search.matches)))
                homologs += search.getFeatures(
                    type="{}_hl".format(gene.qualifiers['gene']))

            # extract the sequence surrounding each homolog
            for h in homologs:
                h.location = FeatureLocation(
                    max(0, h.location.start - 500),
                    min(len(plastid), h.location.end + 500))
            homologs = [SeqRecord(h.extract(plastid.seq)) for h in homologs]

            # find exact or near-exact binding domains for each homolog and
            # add them to the list of potential homologs for the PPR
            ph = []
            for h in homologs:
                domains = []
                for exact in ppr.exact:
                    try:
                        domains += binding.get_domains(exact,
                                                       h,
                                                       percentile=100.0,
                                                       gaps=0)
                    except KeyError:
                        continue
                if domains:
                    domains.sort(key=lambda d: -d.qualifiers['odds'])
                    seq = str(domains[0].extract(h).seq)
                    similarity = max([
                        sequence_similarity(original, seq)
                        for original in ppr.footprints
                    ])
                    print("  {} -> '{}'".format(h.type, seq))
                    ph.append((similarity, seq))

            ph.sort(key=lambda p: -p[0])
            ppr.potentialHomologs[plastid.name] = ph

            # try to avoid running out of RAM
            gc.collect()

    for ppr in pprs:
        print("'{}' footprints = {}".format(ppr.name, ppr.footprints))
        print("potential homologs")
        for key, value in ppr.potentialHomologs.items():
            print("{}: {}".format(key, value))

    # NOTE: this early return leaves the statistics block below unreachable;
    # it appears to have been kept as disabled code.
    return

    stats = []
    for plastid in plastids:
        length = 0
        similarity = 0.0
        for ppr in pprs:
            length += len(ppr.potentialHomologs[plastid.name])
            similarity += sum(
                [p[0] for p in ppr.potentialHomologs[plastid.name]])

        try:
            stats.append({
                'name': plastid.name,
                'avg_similarity': similarity / float(length),
                'avg_homologs': length / len(pprs),
            })
        except ZeroDivisionError:
            stats.append({
                'name': plastid.name,
                'avg_similarity': 0.0,
                'avg_homologs': 0,
            })

    stats.sort(key=lambda s: -s['avg_similarity'])

    with open("tmp", "w") as f:
        for s in stats[0:50]:
            f.write("{name}, {avg_similarity}, {avg_homologs}\n".format(**s))
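One step worth isolating from the function above is the +/-500 nt window taken around each homolog hit; clamping keeps the widened location inside the record. A minimal sketch of that helper:

from Bio.SeqFeature import SeqFeature, FeatureLocation

def window(feature, record, pad=500):
    # widen the feature by `pad` nt on each side, clamped to the record
    loc = FeatureLocation(max(0, feature.location.start - pad),
                          min(len(record), feature.location.end + pad))
    return SeqFeature(location=loc).extract(record.seq)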
Exemple #60
# Assumes project-local names (Gene, blat, get_pseudo_mutations, log) plus:
#   from io import StringIO
#   from Bio import Entrez, SeqIO
#   from Bio import SeqFeature
def get_genomes(genome_id,
                genome_region,
                gene_ids,
                reverse_complement=True,
                entrez_mail='*****@*****.**',
                force=False):
    Entrez.email = entrez_mail
    chromosome, start, end = genome_region
    # NCBI uses 1 based indexing and closed intervals [a,b]
    handle = Entrez.efetch(db='nucleotide',
                           id=genome_id,
                           rettype='fasta',
                           strand=1,
                           seq_start=start + 1,
                           seq_stop=end + 1 + 1)
    record = SeqIO.read(handle, 'fasta')
    hg19 = record.seq

    genomes = {}
    handle = Entrez.read(
        Entrez.esearch(db='nucleotide',
                       term=' '.join(g[1] for g in gene_ids),
                       retmode='xml'))
    for gi, gid in enumerate(handle['IdList']):
        params = {}
        if len(gene_ids[gi]) > 2:
            params = gene_ids[gi][2]
        genome = Entrez.efetch(db='nucleotide',
                               id=gid,
                               rettype='gb',
                               retmode='text',
                               **params).read()
        genome = SeqIO.read(StringIO(genome), 'genbank')

        if reverse_complement:
            genome.seq = genome.seq.reverse_complement()
        alignment = blat(hg19, genome.seq)

        log.trace('NCBI: Gene {} BLAT results: hit {}, query {}', genome.id,
                  alignment.hit_range, alignment.query_range)
        translation = dict(
            (i[0], i[1] + start) for f in alignment
            for i in zip(range(*f.query_range), range(*f.hit_range)))
        cds = [c for c in genome.features if c.type == 'CDS']
        if len(cds) == 0:
            cds = [c for c in genome.features if c.type == 'misc_RNA']
        for cd in cds:
            protein = ''
            if 'translation' in cd.qualifiers:
                # qualifiers values are lists; take the translation string
                protein = cd.qualifiers['translation'][0]

            if reverse_complement:
                exons = [
                    SeqFeature.FeatureLocation(
                        len(genome.seq) - e.end,
                        len(genome.seq) - e.start, 1)
                    for e in cd.location.parts
                ]
                introns = [
                    SeqFeature.FeatureLocation(e2.end, e1.start, 1)
                    for e1, e2 in zip(exons[:-1], exons[1:])
                ]
            else:
                exons = [
                    SeqFeature.FeatureLocation(e.start, e.end, 1)
                    for e in cd.location.parts
                ]
                introns = [
                    SeqFeature.FeatureLocation(e1.end, e2.start, 1)
                    for e1, e2 in zip(exons[:-1], exons[1:])
                ]

            genomes[cd.qualifiers['gene'][0]] = Gene(
                name=cd.qualifiers['gene'][0],
                protein=protein,
                introns=introns,
                exons=exons,
                seq=genome.seq,
                translation=translation,
                pseudo_mutations={},
                pseudo_translation={},
                special_regions={})

    if len(gene_ids) > 1:
        g, p = gene_ids[0][0], gene_ids[1][0]
        muts, pt = get_pseudo_mutations(genomes[g], genomes[p], force)
        genomes[g].pseudo_mutations.update(muts)
        genomes[g].pseudo_translation.update(pt)

    return genomes, hg19
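The NCBI round trip at the heart of this function is just Entrez.efetch plus SeqIO.read. A minimal sketch with placeholder identifiers (set Entrez.email to your own address):

from Bio import Entrez, SeqIO

Entrez.email = 'you@example.org'                         # placeholder address
handle = Entrez.efetch(db='nucleotide', id='NG_008376',  # placeholder id
                       rettype='gb', retmode='text')
record = SeqIO.read(handle, 'genbank')
handle.close()
print(record.id, len(record.seq))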