Example #1
def main():
    for afa_rec, fn_rec in \
        zip(parse(sys.argv[1], 'fasta'), parse(sys.argv[2], 'fasta')):
        assert afa_rec.id == fn_rec.id
        afn_str = backalign(afa_rec.seq, fn_rec.seq.ungap('-'))
        write(SeqRecord(Seq(afn_str), id=afa_rec.id, description=""),
              sys.stdout, 'fasta')
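The backalign helper is not part of this snippet; a minimal sketch of what it might look like (a hypothetical reconstruction, assuming three nucleotides per aligned residue) is:

def backalign(aa_aligned, nt_ungapped):
    # Thread the ungapped nucleotide sequence onto the gapped amino-acid
    # alignment: emit '---' for each gap and 3 nt for each residue.
    out, pos = [], 0
    for aa in str(aa_aligned):
        if aa == '-':
            out.append('---')
        else:
            out.append(str(nt_ungapped[pos:pos + 3]))
            pos += 3
    return ''.join(out)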
Example #2
def retriving(a, b, c):
    pdbid = a
    chainid = b
    uniid = c
    my_record = []
    log = open('pdb.fasta', 'w')
    seqpy = Popen(["python", "pdb_seq.py", pdbid], stdout=PIPE, stderr=PIPE)
    stdout = seqpy.communicate()[0]
    log.write(stdout.decode())  # communicate() returns bytes; decode before writing to a text file
    wait = seqpy.wait()
    log.close()
    seqfile = open("pdb.fasta")
    for seq_record in parse(seqfile, "fasta"):
        r = seq_record.id.split('_')
        if r[0][-1] == chainid:
            my_record.append(seq_record)
    seqfile.close()
    url = 'https://www.uniprot.org/uniprot/' + uniid + '.fasta'
    seqfile2 = urlopen(url)
    for seq_record in parse(seqfile2, "fasta"):
        r = seq_record.id.split('|')
        uniprot = r[1]
        my_record.append(seq_record)
    seqfile2.close()
    write(my_record, "test.fasta", "fasta")
Example #3
def trimByConstant(inPath1,
                   outPath1,
                   inPath2=None,
                   outPath2=None,
                   cEnd=20,
                   minLen=75,
                   cBegin=None,
                   outFormat="fasta",
                   inFormat="fastq",
                   stats=None):
    """Trim constant number of nucleotides from the end of each read in a 
    fastq file.
    
    If cBegin is given it will be used as the number of reads to cut from the 
    beginning of a read.
    """
    files = [parse(open(inPath1), inFormat)]
    if outPath1 is None:
        outFiles = [sys.stdout]
    else:
        outFiles = [open(outPath1, "w")]
    if inPath2:
        files.append(parse(open(inPath2), inFormat))
        outFiles.append(open(outPath2, "w"))
    return _trimNStreams(_trimReadByConstant, files, outFiles, outFormat,
                         stats, cEnd, cBegin, minLen)
Example #4
def trimByQuality(path1,
                  outPath1,
                  path2=None,
                  outPath2=None,
                  minQual=20,
                  minLen=75,
                  bothEnds=True,
                  outFormat="fasta",
                  stats=None):
    """Trim fastq reads according to a quality threshold.
    
    If path two is given path1 and path2 should contain reads from read1 and 
    read2 of a paired end library.
    Reads with length below minLen after trimming will be discarded (together 
    with there mate if path2 is given).
    If bothEnds is true, nucleotides with quality below minQual will also be 
    removed from the begining of the read.
    """

    files = [parse(open(path1), "fastq")]
    if outPath1 is None:
        outFiles = [sys.stdout]
    else:
        outFiles = [open(outPath1, "w")]
    if path2:
        files.append(parse(open(path2), "fastq"))
        outFiles.append(open(outPath2, "w"))

    return _trimNStreams(_trimReadByQuality, files, outFiles, outFormat, stats,
                         minQual, minLen, bothEnds)
Example #5
def build_kmer_df_learn(lp_fasta, l_label=None):
    from Bio.SeqIO import parse
    from itertools import chain
    from pandas import DataFrame, Series, concat

    l_kmer_size = [1, 2, 3]
    l_letter = ['M', 'F', 'L', 'I', 'V', 'P', 'T', 'A', 'Y', 'H', 'Q', 'N', 'K', 'D', 'E', 'C', 'R', 'S', 'W', 'G']
    l_kmer = list(chain(*[generate_kmer(kmer_size, l_letter, l_letter) for kmer_size in l_kmer_size]))
    l_l_kmer_freq = []
    l_seq_id = []
    for p_fasta in lp_fasta:
        for record in parse(p_fasta, 'fasta'):
            l_seq_id.append(record.id)
            d_record = {}
            seq = str(record.seq)
            len_seq = len(seq)
            for i in range(len_seq):
                for kmer in [seq[i:i + kmer_size] for kmer_size in l_kmer_size if i <= len_seq - kmer_size]:
                    d_record[kmer] = d_record.get(kmer, 0) + 1
            l_kmer_freq = [d_record.get(kmer, 0) for kmer in l_kmer]
            l_l_kmer_freq.append(l_kmer_freq)

    df_data = DataFrame(l_l_kmer_freq, columns=l_kmer, index=l_seq_id)

    if l_label:
        s_label = Series(name='label')
        for p_fasta, label in zip(lp_fasta, l_label):
            l_seq_id = [record.id for record in parse(p_fasta, 'fasta')]
            s_label = concat([s_label, Series(label, name='label', index=l_seq_id)])
        return df_data, s_label
    else:
        return df_data
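generate_kmer is not shown in this snippet; a plausible stand-in (signature assumed from the call above, third argument unused) enumerates every k-mer over the amino-acid alphabet:

from itertools import product

def generate_kmer(kmer_size, l_letter, _l_letter=None):
    # Hypothetical helper: all strings of length kmer_size over l_letter.
    return [''.join(p) for p in product(l_letter, repeat=kmer_size)]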
Example #6
def main(argv: Optional[List[str]] = None) -> int:
    parser = argument_parser()
    args = parser.parse_args(argv)

    try:
        query = parse(args.query, "fasta")
        reference = parse(args.reference, "fasta")
        print(orthoani(query, reference, threads=args.jobs))
        return 0

    except KeyboardInterrupt:
        print("Interrupted.", file=sys.stderr)
        return -signal.SIGINT

    except Exception as e:
        if args.traceback:
            print(
                "".join(
                    better_exceptions.format_exception(type(e), e,
                                                       e.__traceback__)),
                file=sys.stderr,
            )
        else:
            print(e, file=sys.stderr)
        return typing.cast(int, getattr(e, "errno", 1))
Example #7
def removSeqsWithN(threshold, inStream1, outStream1, inStream2=None, 
                   outStream2=None, fileForm="fastq"):
    """Write all sequences with equal or less than threshold Ns to the 
outStream.
    
Optinal sequences with more Ns can be written to a different stream.
    """
    t=0
    n=0
    strIt1 = parse(inStream1, fileForm)
    if not inStream2 is None:
        strIt2 = parse(inStream2, fileForm)
    while True:
        t+=1
        try:
            r1 = strIt1.next()
        except StopIteration:
            break
        if threshold == -1:
            remove = r1.seq.count("N") == len(r1.seq)
        else:
            remove = r1.seq.count("N") > threshold
        if not inStream2 is None:
            r2 = strIt2.next()
            if threshold == -1:
                remove |= r2.seq.count("N") == len(r2.seq)
            else:
                remove |= r2.seq.count("N") > threshold
        if remove:
            n+=1
        else:
            outStream1.write(r1.format(fileForm))
            if not inStream2 is None:
                outStream2.write(r2.format(fileForm))
    return t, n
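A short usage sketch (hypothetical file names), keeping reads with at most two Ns:

with open('r1.fastq') as in1, open('r1.kept.fastq', 'w') as out1:
    total, removed = removSeqsWithN(2, in1, out1)
    print('kept', total - removed, 'of', total)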
Example #8
def retriving(a,b,c):
    pdbid     = a
    chainid   = b
    uniid     = c
    my_record = []
    log       = open('pdb.fasta','w')
    seqpy     = Popen(["python","pdb_seq.py",pdbid],stdout=PIPE,stderr=PIPE)
    stdout    = seqpy.communicate()[0]
    log.write(stdout.decode())  # communicate() returns bytes
    wait      = seqpy.wait()
    log.close()
    seqfile   = open("pdb.fasta")
    for seq_record in parse(seqfile, "fasta"):
        r = seq_record.id.split('_')
        if r[0][-1]==chainid:
            my_record.append(seq_record)
    seqfile.close()
    url = 'http://www.uniprot.org/uniprot/'+uniid+'.fasta'
    seqfile2 = urlopen(url)
    for seq_record in parse(seqfile2, "fasta"):
        r = seq_record.id.split('|')
        uniprot = r[1]
        my_record.append(seq_record)
    seqfile2.close()
    write(my_record, "test.fasta", "fasta")
Example #9
def get_mgedb(mgepath):
    print("[-] Preparing mobile genetic element database")
    Path(mgepath).mkdir(parents=True, exist_ok=True)

    to_write = []

    for i in range(1, 15):
        file = fetch_url(
            f'https://raw.githubusercontent.com/katholt/'
            f'Kleborate/master/kleborate/data/ICEKp_references'
            f'/ICEKp{i}.embl', None, f'{mgepath}/icekp{i}')

        flist = open(file).readlines()
        parsing = False
        fasta = ''
        for line in flist:
            if line.startswith("//"):
                parsing = False
            if parsing:
                fasta += line.replace(" ", "").strip()
            if line.startswith("SQ"):
                parsing = True
        icekp = f'>ICEKp{i}\n' + ''.join(ch for ch in fasta if not ch.isdigit())
        for r in parse(StringIO(icekp), 'fasta'):
            to_write.append(r)
        remove(file)

    ice = fetch_url(
        'https://db-mml.sjtu.edu.cn/ICEberg2/download/ICE_seq_all.fas', None,
        mgepath + '/ice.fna')
    t4ss = fetch_url(
        'https://db-mml.sjtu.edu.cn/ICEberg2/download/T4SS-type_ICE_seq_all.fas',
        None, mgepath + '/t4ss.fna')
    aice = fetch_url(
        'https://db-mml.sjtu.edu.cn/ICEberg2/download/AICE_seq_all.fas', None,
        mgepath + '/aice.fna')
    ime = fetch_url(
        'https://db-mml.sjtu.edu.cn/ICEberg2/download/IME_seq_all.fas', None,
        mgepath + '/ime.fna')
    cime = fetch_url(
        'https://db-mml.sjtu.edu.cn/ICEberg2/download/CIME_seq_all.fas', None,
        mgepath + '/cime.fna')

    filenames = [ice, t4ss, aice, ime, cime]
    accessions = ['ICEKp1']
    for f in filenames:
        for r in parse(f, 'fasta'):
            r.id = r.id.split('|')[2]
            r.id = r.id.replace('[', '_')
            r.id = r.id.replace(']', '')
            if r.id not in accessions:
                accessions.append(r.id)
                to_write.append(r)
        remove(f)

    write(to_write, mgepath + '/mgedb', "fasta")

    return run_makeblastdb(mgepath + '/mgedb', 'nucl', f'{mgepath}/mgedb')
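Assuming the downloaded ICEKp files are well-formed EMBL records, the hand-rolled SQ-block scraping above could be replaced with Biopython's own EMBL parser; a sketch:

from Bio.SeqIO import parse

def embl_to_fasta_record(embl_path, name):
    # Hypothetical alternative: let Bio.SeqIO read the EMBL file directly.
    rec = next(parse(embl_path, 'embl'))
    rec.id, rec.description = name, ''
    return rec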
Example #10
    def test_multiple_contig(self):
        record = parse(fspath(self.data / "NZ_AAEN01000029.fna"), "fasta")
        with (self.data / "NZ_AAEN01000029.fna.chopped.fasta").open() as f:
            expected = list(parse(f, "fasta"))
        with tempfile.NamedTemporaryFile(mode="rt", suffix=".fna") as tmp:
            orthoani._chop(record, tmp.name, 1020)
            actual = list(parse(tmp, "fasta"))
        for actual_record, expected_record in zip(actual, expected):
            self.assertEqual(actual_record.seq, expected_record.seq)
Example #11
    def test_gibson_offtarget_primer2(self):
        """Create Gibson primers when there's offtarget in one's end (2)."""

        insert = next(parse(os.path.join(TEST_DIR, "BBa_K1649003.fa"),
                            "fasta"))
        backbone = next(parse(os.path.join(TEST_DIR, "pDusk.fa"), "fasta"))

        plasmid, primer_pairs = gibson([insert, backbone])

        self.assertTrue(plasmid and primer_pairs)
Example #12
def main():
    args = parse_args(sys.argv)
    logging.basicConfig(level=args.log_level)
    logger.debug(args)

    align_index = {rec.id: rec
                   for rec in parse(args.align_handle, args.fmt_align)}

    for rec in backalign_recs(parse(args.in_handle, args.fmt_infile),
                              align_index):
        write(rec, args.out_handle, args.fmt_outfile)
Example #13
def main():
    args = parse_args(sys.argv)
    logging.basicConfig(level=args.log_level)
    logger.debug(args)

    if args.match_order:
        for rec in get_recs(parse(args.in_handle, args.fmt_infile),
                            get_list(args.list_handle)):
            write(rec, args.out_handle, args.fmt_outfile)
    else:
        recs = get_rec_list(parse(args.in_handle, args.fmt_infile),
                            get_list(args.list_handle))
        write(recs, args.out_handle, args.fmt_outfile)
Example #14
def main():
    args = parse_args(sys.argv)
    logging.basicConfig(level=args.log_level)
    logger.debug(args)

    for rc_rec in revcompl_recs(parse(args.in_handle, args.fmt_infile)):
        write(rc_rec, args.out_handle, args.fmt_outfile)
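Example #15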
def main():
    start_time = time()

    parser = cmd_parse()  # Parsing of command line arguments

    with open(parser['file']) as genome:
        fasta_genome = to_dict(parse(genome, 'fasta'))  # Reading genome file

    jobs = parser['jobs']  # Number of processes to parallelize
    fragments = parser['fragments_num']  # Number of fragments to get
    frags_per_core = [fragments // jobs] * (jobs - 1)
    frags_per_core.append(fragments - sum(frags_per_core))  # Number of fragments each process will generate

    dis_file = parser['dis_file']  # Address of empirical distribution file
    emp_dis = rfd(dis_file) if dis_file is not None else None  # Empirical distribution reading
    my_seed = parser['seed']  # Numpy seeding argument

    processes = []  # List of processes to parallelize
    for job, fragments_num in enumerate(frags_per_core):
        seeding = ((my_seed + job) % MAX_SEED if my_seed != -1 else my_seed) if my_seed is not None else None
        # Processing of seeding argument
        processes.append(Process(target=disassembler,
                                 args=(fasta_genome, parser['seq_type'], fragments_num, parser['out_file'],
                                       parser['depth'], parser['read_len'], job, seeding, emp_dis, parser['mean_len'])))
        processes[-1].start()
    for process in processes:
        process.join()
    print_verbose(
        'The program completed disassembling without any errors. Elapsed time={:f}'.format(time() - start_time),
        parser['session_id'], parser['logfile'], parser['verbose'], parser['params'])  # Parameters logging
Example #16
def readFasta(inStream):
    """Read fasta file and save a table with sequence IDs and sequence length"""
    out = []
    for record in parse(inStream, "fasta"):
        out.append(
            (record.id, len(record), GC(record.seq), record.seq.count("N")))
    return out
Example #17
def prep_database(locus_type, gene_type):
    from sys import exc_info

    # attempt to build local dicts
    try:
        # screen input arguments
        locus_type = str(locus_type).upper()
        gene = str(gene_type).upper()

        # pull location of corresponding ref_db
        ref_db_file = all_ref_dbs[locus_type][gene_type]

        # prep return
        seq_dict = {}
        type_dict = {}

        # parse fasta file to build local ref_dbs as seq_dict and type_dict
        for nt in parse(ref_db_file, "fasta"):
            nts = str(nt.seq).lower()
            p = nt.description.split('|')
            allele = p[1]
            gene_type = p[3]
            seq_dict[allele] = nts
            type_dict[allele] = gene_type

        return seq_dict, type_dict

    # handle incorrect number of arguments passed
    except TypeError:
        print(
            "Invalid input. prep_database() takes exactly 2 string inputs as arguments: locus_type and gene_type."
        )
        raise TypeError

    # handle invalid arguments passed
    except ValueError:
        print(
            "Invalid input. prep_database() takes 2 string inputs as arguments: locus_type and gene_type."
        )
        raise ValueError

    # handle missing file location for locus+gene lookup
    except KeyError:
        print("Mismatch between locus_type and gene_type.")
        print(
            "Specified gene_type may not be in given locus_type, or locus_type is not included in known options."
        )
        raise KeyError

    # handle missing ref_db file for locus+gene lookup
    except FileNotFoundError:
        print(
            "Reference file for locus type {locus_type} and gene_type {gene_type} was not found."
        )
        raise FileNotFoundError

    # handle unknown error
    except:
        print("Unexpected error:", exc_info()[0])
        raise
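Example #18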
def getReadStats(inStream):
    readStats = {
        "rId": [],
        "lane": [],
        "tile": [],
        "x": [],
        "y": [],
        "qual": [],
        "n_count": [],
        "length": []
    }
    for rec in parse(inStream, "fastq"):
        #position
        idArr = rec.id.split(" ")[0].split(":")
        lane = int(idArr[3])
        tile = int(idArr[4])
        x = int(idArr[5])
        y = int(idArr[6])
        readStats["rId"].append(rec.id)
        readStats["lane"].append(lane)
        readStats["tile"].append(tile)
        readStats["x"].append(x)
        readStats["y"].append(y)
        #mean quality
        qual = float(sum(
            rec.letter_annotations["phred_quality"])) / len(rec)
        readStats["qual"].append(qual)
        #number of Ns
        nCount = rec.seq.count("N")
        readStats["n_count"].append(nCount)
        #length
        length = len(rec)
        readStats["length"].append(length)
    return readStats
Example #19
def write_reads(readfile, reads, outfile, verbose):
    if not outfile:
        fh_out = sys.stdout
    else:
        fh_out = open(outfile, 'w')

    if ".gz" in readfile:
        fh_in = gz.open(readfile, 'rt')
    else:
        fh_in = open(readfile, 'r')
    i = written = 0  # i stays defined even if the file yields no reads
    if verbose:
        logging.info("Parsing reads in {}".format(readfile))
    for i, record in enumerate(parse(fh_in, "fastq"), start=1):
        if i % 100000 == 0 and verbose:
            logging.info("{} reads parsed, {} reads written...".format(
                i, written))
        rec_id = record.id
        id_base = rec_id.rsplit("/")[0]  # avoids shadowing the id() builtin
        if rec_id in reads or id_base in reads:
            fh_out.write("{}".format(record.format("fastq")))
            written += 1
    if verbose:
        logging.info("{} reads parsed, {} reads written...Done\n".format(
            i, written))
Example #20
def filter_seqs_by_len(infile, outfile, minlen):
    """Filters sequences by length

    Parameters
    ----------
    infile: str
        Sequence file in fasta format
    outfile: str
        File in fasta format containing sequences longer than minlen
    minlen: int
        Minimum size of sequences to keep
    """

    import sys
    import tqdm
    from Bio.SeqIO import parse, write
    i = 0
    with open(outfile, 'w') as fh:
        for record in tqdm.tqdm(parse(infile, 'fasta'),
                                unit=" sequences",
                                ncols=100,
                                desc="Filtering sequences"):
            if len(record) >= minlen:
                write(record, fh, "fasta")
                i += 1
    sys.stderr.write("{} sequences longer than {} written to {}\n".format(
        i, minlen, outfile))
Example #21
def percentages_from_proteins(path):
    from Bio.SeqUtils.ProtParam import ProteinAnalysis
    import numpy as np
    file = open(path)
    names_list = []
    sequence_list = []
    sources_list = []
    desc_list = []
    taxo_list = []
    keyw_list = []
    taxid_list = []
    for record in parse(file, "genbank"):
        cdsnum = 0
        for feat in record.features:
            prot = record.seq
            analysed_seq = ProteinAnalysis(str(prot))  # composition statistics for the sequence
            # get_amino_acids_percent() returns a dict; store it in the list
            sequence_list.append(analysed_seq.get_amino_acids_percent())
            names_list.append(str(record.name) + "_CDS#" + str(cdsnum))
            sources_list.append(record.annotations['source'])
            keyw_list.append(record.annotations['keywords'])
            taxo_list.append(record.annotations['taxonomy'])
            desc_list.append(record.description)
            taxid_list.append(record.annotations["organism"])
            cdsnum += 1
    # list of dictionaries to the numpy array
    aas = ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y']
    nseqs = len(sequence_list)
    percents = np.zeros((nseqs, 20))
    for i in range(nseqs):
        percdict = sequence_list[i]
        for an in range(20):
            percents[i, an] = percdict[aas[an]]
    return percents, names_list, sources_list, desc_list, taxo_list, keyw_list, taxid_list, sequence_list
Example #22
def write_regions(f, fh, saf=False):
    for record in parse(f, "fasta"):
        if saf:
            fh.write("{}\t{}\t{}\t{}\t{}\n".format(record.id, record.id, 0,
                                                   len(record), "+"))
        else:
            fh.write("{}\t{}\t{}\n".format(record.id, 0, len(record)))
Example #23
    def test_lincoln(self):
        """Create a set of plasmids from a Combinatorial assembly."""

        records = []
        test_dir = os.path.join(TEST_DIR, "lincoln")
        for (_, _, filenames) in os.walk(test_dir):
            for file in filenames:
                if not file.endswith(".fa"):
                    continue
                test_file = os.path.join(test_dir, file)
                for record in parse(test_file, "fasta"):
                    records.append(record)

        design = Combinatorial(records)
        protocol = GoldenGate(design, enzymes=[BsaI], separate_reagents=True)
        protocol.run()

        csv_output = protocol.to_csv(
            os.path.join(OUT_DIR, "lincoln.layout.csv"))
        protocol.to_picklists(os.path.join(OUT_DIR, "lincoln.hamilton.csv"),
                              platform="hamilton")

        self.assertTrue(protocol.output)
        self.assertEqual(1, csv_output.count("Plate:1"))
        self.assertIn(",,,,,,,,,,,,,A",
                      csv_output)  # empty wells in plate 1 before plate2
Example #24
    def translateSequence(self, file_handle, stop):
        records = parse(file_handle, "fasta")

        for record in records:
            self.sequence = Seq(str(record.seq), IUPAC.unambiguous_rna)
            self.name = record.name
            print("Name: {}".format(self.name))
            if Alphabet._verify_alphabet(
                    self.sequence) == True and stop == 'y':
                self.translated_seq = self.sequence.translate(to_stop=True)
                print("Sequence: {}".format(self.sequence))
                print("Translated sequence: {}".format(self.translated_seq))
                print(
                    "------------------------------------------------------------"
                )
            elif Alphabet._verify_alphabet(
                    self.sequence) == True and stop == 'n':
                self.translated_seq = self.sequence.translate()
                print("Sequence: {}".format(self.sequence))
                print("Translated sequence: {}".format(self.translated_seq))
                print(
                    "------------------------------------------------------------"
                )
            else:
                print(
                    "This sequence is not a RNA, can't translate that. Load correct sequence."
                )
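Example #25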
def combineFastaFiles(inputDirectory, outputDirectory, pattern):
    seqObjects = []

    print('I am inside extract sequences method.')

    for root, subdirs, files in walk(inputDirectory):
        # print('walking in this directory:' + root)
        for fileName in files:
            # print('checking this file:' + fileName)
            fullPath = join(root, fileName)
            relativePath = relpath(fullPath, inputDirectory)
            # print('full path:' + fullPath)
            # print('The relative path:' + relativePath)

            # If filename has the pattern in it

            if (pattern in relativePath):
                print('This is the file we are looking for: ' + relativePath)

                for record in parse(fullPath, "fasta"):
                    record.id = record.id + ' ' + relativePath.replace('/', '_').replace('\\', '_')
                    record.description = ''

                    seqObjects.append(record)

    # Print sequences to file.
    outputFileName = join(outputDirectory, 'CombinedSequences.fasta')
    outputFile = createOutputFile(outputFileName)
    write(seqObjects, outputFile, 'fasta')
    outputFile.close()
Example #26
def get_genomic_ref(fasta: str, outfile: str, gene_dict: dict) -> None:
    """Parses ungapped FASTA file and writes genes+alleles to FASTA file.

    This function uses only the alleles which were selected from the gapped FASTA.
    This ensures that the anchor file and genomic reference file correspond
    to one another.

    Parameters
    ----------
    fasta : str
        Ungapped FASTA file.
    outfile : str
       Path for the FASTA file to be written.
    gene_dict : dict
        Dictionary which has a single key for gene+allele combinations.
        It contains only two alleles for each gene. For each gene+allele,
        it contains the numbers of gaps and type of functionality.

    Returns
    -------
    None
    """

    seqs = []
    headers = []
    for seq in parse(fasta, 'fasta'):
        headers.append(seq.description)
        seqs.append(str(seq.seq))

    with open(outfile, 'w') as f:
        for idx,header in enumerate(headers):
            if header in gene_dict:
                f.write(">"+header+"\n")
                f.write(seqs[idx] + "\n")
Example #27
def auto_detect_read_length(seqfile, file_type):
    """ Find median read length from first 10K reads in seqfile """
    valid_lengths = [
        50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 175, 200, 225, 250,
        300, 350, 400, 450, 500
    ]
    read_lengths = []
    try:
        seq_iterator = parse(open_file(seqfile), file_type)
        for index, record in enumerate(seq_iterator):
            if index == 10000: break
            read_lengths.append(len(record.seq))
    except Exception:
        sys.exit(
            "Could not detect read length of: %s\nThis may be due to an invalid format\nTry specifying it with -l"
            % seqfile)
    median_read_length = int(median(read_lengths))
    if median_read_length < valid_lengths[0]:
        sys.exit(
            "Median read length is %s. Cannot compute AGS using reads shorter than 50 bp."
            % median_read_length)
    for index, read_length in enumerate(valid_lengths):
        if read_length > median_read_length:
            return valid_lengths[index - 1]
    return valid_lengths[-1]
Example #28
def count(foo):
    '''takes a FASTA file named foo and returns the number of records'''
    f = parse(open(foo, 'r'), 'fasta')
    n = 0
    for dummyX in f:
        n += 1
    return n
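An equivalent, more idiomatic version (same Bio.SeqIO.parse import assumed) that also closes the file handle:

def count(foo):
    '''number of records in the FASTA file named foo'''
    with open(foo) as handle:
        return sum(1 for _ in parse(handle, 'fasta'))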
Example #29
def get_flankdb(flankpath):
    Path(flankpath).mkdir(parents=True, exist_ok=True)

    print("[-] Preparing flanking virulence gene database")

    patric = fetch_url(
        'ftp://ftp.patricbrc.org/specialty_genes/referenceDBs/PATRIC_VF.faa', None, flankpath + '/patric.faa')
    victors = fetch_url(
        'http://www.phidias.us/victors/downloads/gen_downloads_protein.php', None, flankpath + '/victors.faa')
    vfdb = fetch_url(
        'http://www.mgc.ac.cn/VFs/Down/VFDB_setB_pro.fas.gz', None, flankpath + '/vfdb.faa.gz')

    params = {'query': 'siderophore AND '
                       'taxonomy:"Bacteria [2]" AND '
                       'NOT receptor NOT partial NOT fragment', 'format': 'fasta'}
    bgcs = fetch_url('http://www.uniprot.org/uniprot/', params, flankpath + '/bgcs.faa')

    filenames = [patric, victors, vfdb, bgcs]

    db = ''

    for fname in filenames:
        if fname.endswith('.gz'):
            with gopen(fname, 'rt') as infile:
                for line in infile:
                    db += line
        else:
            with open(fname, 'rt') as infile:
                for line in infile:
                    db += line
        remove(fname)

    d1 = db.count('>')
    print(f"[-] {d1} total proteins downloaded")
    accessions = set()
    db2 = ''
    for r in parse(StringIO(db), 'fasta'):
        if r.id not in accessions:
            accessions.add(r.id)
            db2 += r.format('fasta')
    d2 = db2.count('>')
    print(f"[-] Removed {d1 - d2} duplicate accessions")

    fasta_lines = db2.split('>')  # split into per-sequence chunks; '>'.join() below restores the headers

    def remove_complete_duplicates(fasta_lines):
        print("[>] Removing redundancy...  ", end="", flush=True)
        outputlist, setofuniqsequence = [], set()
        for sequence in fasta_lines:
            if sequence not in setofuniqsequence:
                outputlist.append(sequence)
                setofuniqsequence.add(sequence)
        print(f"{len(outputlist)} proteins remaining")
        return outputlist

    with open(flankpath + '/flankdb', 'w') as flank_file:
        flank_file.write('>'.join(remove_complete_duplicates(fasta_lines)))

    return run_makeblastdb(flankpath + '/flankdb', 'prot', flankpath + '/flankdb')
Example #30
def main():
    signal(SIGPIPE, SIG_DFL)
    args = parse_args(sys.argv)
    logging.basicConfig(level=args.log_level)
    logger.debug(args)

    for trans_rec in translate_recs(parse(args.in_handle, args.fmt_infile), code=args.code):
        write(trans_rec, args.out_handle, args.fmt_outfile)
Example #31
def import_primer_rev(index):
    # Import primers
    local_path = pathlib.Path(__file__).parent.absolute()
    primer_list = list(parse(str(local_path) + '/reverse_finalprimers.fasta', 'fasta'))

    # Extract the primer at the given index
    primer = str(primer_list[index].seq)
    return primer
Example #32
def main():
    args = parse_args(sys.argv)
    logging.basicConfig(level=args.log_level)
    logger.debug(args)

    for rec in rename_recs(parse(args.in_handle, args.fmt_infile),
                           get_map(args.map_handle)):
        logger.debug("Writing {}".format(rec.id))
        write(rec, args.out_handle, args.fmt_outfile)
Example #33
def convertWithCtable(file_, ctable, out):
    """ Convert the names of a fasta using a conversion table"""
    from Bio.SeqIO import parse, write
    d = dict(line.strip().split() for line in open(ctable))
    sequences = list(parse(file_, 'fasta'))
    for s in sequences:
        s.id = d.get(s.id, s.id)  # keep the original id if it has no mapping
    write(sequences, out, 'fasta')
    return
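The conversion table is assumed to be whitespace-separated, one old-id/new-id pair per line, e.g. (hypothetical ids):

contig_1    chr1
contig_2    chr2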
Example #34
def truncate_seqs(f, n):
    '''
    truncate FASTA seqs truncates sequences in a FASTA file named f to at most
    n bases, writing a new FASTA file named f.n.fa
    '''
    seq_recs = parse(open(f, 'r'), 'fasta')
    foo = '{}.{}.fa'.format(f, n)  # produces f.n.fa, as described in the docstring
    writer(foo, (rec[0:n] for rec in seq_recs))
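The writer helper is not shown here; a minimal stand-in consistent with the call above might be:

def writer(path, records):
    # Hypothetical helper: write an iterable of SeqRecords as FASTA.
    from Bio.SeqIO import write
    with open(path, 'w') as handle:
        write(records, handle, 'fasta')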
Example #35
def mature_accession_to_name(mature_accessions, mature_fa=None):
    """Convert mature miRNA accessions to their respective names"""
    if mature_fa is None:
        mature_fa = _retrieve_mature()
    mton = {
        record.description.split(' ')[1]: record.id
        for record in parse(mature_fa, 'fasta')
    }
    return mature_accessions, mton
Example #36
def remove_alignment_gap(a):
    if a.ofile:
        out = a.ofile
    else:
        out = a.ifile.split('.')[0] + '_nogap.fa'
    with open(out, 'w') as f:
        for rec in parse(a.ifile, 'fasta'):
            f.write('>' + rec.id + '\n')
            f.write(str(rec.seq).replace('-', '') + '\n')
Example #37
def initialize_graph(genome):
	import networkx as nx
	from Bio.SeqIO import parse
	G = nx.Graph()
	contigs = [r for r in parse(genome, 'fasta')]
	for c in contigs:
		id_, length = c.id, len(c.seq)
		G.add_node(id_, length=length)
	return G
Example #38
def get_seq_from_files(filename):
    ext = filename.split('.')[-1]
    table = {'fasta':'fasta', 'gbk':'genbank'}
    fmt = table.get(ext, 'fasta')
    handle = open(filename)
    for r in parse(handle, fmt):
        print(r.id)
        print(r.seq)
    handle.close()
Example #39
def initialize_graph(genome):
    import networkx as nx
    from Bio.SeqIO import parse
    G = nx.Graph()
    contigs = parse(genome, 'fasta')
    for c in contigs:
        id_, length = c.id, len(c.seq)
        G.add_node(id_, length=length)
    return G
Example #40
def mature_name_to_accession(mature_names, mature_fa=None):
    """Convert a miRNA names to their respective accessions"""
    if mature_fa is None:
        mature_fa = _retrieve_mature()
    ntom = {
        record.id: record.description.split(' ')[1]
        for record in parse(mature_fa, 'fasta')
    }
    return mature_names, ntom
Example #41
def store_lengths(f, minlen=False):
    r = {}
    for record in parse(f, "fasta"):
        if minlen:
            if len(record.seq) < minlen:
                continue
        r[record.id] = len(record.seq)
    df = pd.DataFrame(r, index=["length"]).T
    return df
Example #42
def main():
    args = parse_args(sys.argv)
    logging.basicConfig(level=args.log_level)
    logger.debug(args)

    recs = list(parse(args.in_handle, args.fmt_infile))
    assert len(recs) > 0

    for rec in rm_recs(recs,
                       get_list(args.list_handle)):
        write(rec, args.out_handle, args.fmt_outfile)
Example #43
def splitfasta(infile, wrap=False):
    for seq in parse(infile, 'fasta'):
        outfile = str(seq.id)
        for p in string.punctuation:
            outfile = outfile.replace(p, '_')
        outfile = outfile + '.fasta'
        with open(outfile, 'w') as fh:
            fasta_out = FastaIO.FastaWriter(fh, wrap=wrap)
            fasta_out.write_header() # Does nothing, but required
            fasta_out.write_record(seq)
            fasta_out.write_footer() # Does nothing, but required
Example #44
def random_seq(foo, n):
    '''takes a file foo and returns n random sequences from it'''
    max_n = count(foo)
    record_numbers = set(random.sample(range(1, max_n + 1), n))  # n distinct 1-based record indices
    seq_recs = parse(open(foo, 'r'), 'fasta')
    i = 0
    seqs = []
    for rec in seq_recs:
        i += 1
        if i in record_numbers:
            seqs.append(rec)
    return seqs
Example #45
def main():
    with open(sys.argv[1]) as names_handle:
        remove_names = set(line.strip() for line in names_handle)

    out_recs = []
    for rec in parse(sys.stdin, 'fasta'):
        if rec.name in remove_names:
            continue
        else:
            out_recs.append(rec)

    write(out_recs, sys.stdout, 'fasta')
Example #46
def permute_fasta(f):
    '''
    takes a FASTA file and returns a new FASTA file with each sequence randomly
    permuted (separately, such that its % A,T,G,C doesn't change)
    '''
    mute = Bio.Seq.Seq.tomutable  # Biopython < 1.78; newer versions use Bio.Seq.MutableSeq
    shuffle = random.shuffle
    with open(f + '_permuted.fa', 'w') as output:
        with open(f, 'r') as fobj:
            for seq_rec in parse(fobj, 'fasta'):
                seq_rec.seq = mute(seq_rec.seq)
                shuffle(seq_rec.seq)
                write(seq_rec, output, 'fasta')
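On Biopython 1.78+, where Seq.tomutable() is gone, the same in-place shuffle could be written as (a sketch under that assumption):

import random
from Bio.Seq import MutableSeq

def permute_record(seq_rec):
    # Shuffle a record's letters in place; base composition is unchanged.
    mutable = MutableSeq(str(seq_rec.seq))
    random.shuffle(mutable)
    seq_rec.seq = mutable
    return seq_rec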
Example #47
def auto_detect_fastq_format(seqfile):
	""" Use first 50,000 reads to detect quality encoding """
	max_reads = 50000
	formats = ['fastq-illumina', 'fastq-solexa', 'fastq-sanger']
	for format in formats:
		try:
			index = 0
			seq_iterator = parse(open_file(seqfile), format)
			for rec in seq_iterator:
				if index == max_reads: break
				index += 1
			return format
		except Exception:
			pass
	sys.exit("Could not detect quality score encoding of: %s\nThis may be due to an invalid format\nTry specifying it with -c" % seqfile)
Example #48
def main():
    args = parse_args(sys.argv)
    logging.basicConfig(level=args.log_level)
    logger.debug(args)

    all_recs = OrderedDict()
    for rec in parse(args.in_handle, args.fmt_infile):
        all_recs[rec] = len(rec)

    mode = Counter(all_recs.values()).most_common()[0][0]
    for rec in all_recs:
        if all_recs[rec] == mode:
            write(rec, args.out_handle, args.fmt_outfile)
        else:
            warn(cli.DropSequenceWarning(
                "{} had length {}, not {}".format(rec.id, len(rec), mode)))
Example #49
    def FastaToFDB(self, fastafile):
        fdb_registers = []
        content = open(fastafile)

        sequences = parse(content, 'fasta')

        for sequence in sequences:
            fdb_register = FDBRegister()
            fdb_register.filename = fastafile
            fdb_register.description = sequence.id
            fdb_register.gene = str(sequence.seq)

            fdb_registers.append(fdb_register)

        content.close()

        return self.mount_fdb_file(fdb_registers)
Example #50
def main():
    args = parse_args(sys.argv)
    logging.basicConfig(level=args.log_level)
    logger.debug(args)

    for rec_in in parse(args.in_handle, 'fastq'):
        logger.debug(rec_in)
        rec_out = quality_trim(rec_in, args.quality_threshold,
                               keep_columns=args.keep_columns)
        length = len(rec_out.seq)
        if length < args.min_length:
            warn(("Length of sequence {} less than threshold. "
                  "{} < {}. Dropping.").\
                     format(rec_out.id, length, args.min_length),
                 cli.DropSequenceWarning)
        else:
            write(rec_out, args.out_handle, args.fmt_outfile)
Example #51
def auto_detect_read_length(seqfile, file_type):
	""" Find median read length from first 10K reads in seqfile """
	valid_lengths = [50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 175, 200, 225, 250, 300, 350, 400, 450, 500]
	read_lengths = []
	try:
		seq_iterator = parse(open_file(seqfile), file_type)
		for index, record in enumerate(seq_iterator):
			if index == 10000: break
			read_lengths.append(len(record.seq))
	except Exception:
		sys.exit("Could not detect read length of: %s\nThis may be due to an invalid format\nTry specifying it with -l" % seqfile)
	median_read_length = int(median(read_lengths))
	if median_read_length < valid_lengths[0]:
		sys.exit("Median read length is %s. Cannot compute AGS using reads shorter than 50 bp." % median_read_length)
	for index, read_length in enumerate(valid_lengths):
		if read_length > median_read_length:
			return valid_lengths[index-1]
	return valid_lengths[-1]
Example #52
def fetch_names(id_list):
    organism_names = {}

    # Doing 100 by 100 to make sure requests to NCBI are not too big
    for i in range(0, len(id_list), 100):
        j = min(i + 100, len(id_list))

        sys.stderr.write("Fetching entries from %s to %s from GenBank\n" % (i, j))
        sys.stderr.flush()
        result_handle = Entrez.efetch(db=db, rettype="gb", id=id_list[i:j])

        # Populate result per organism name
        for record in parse(result_handle, 'genbank'):
            # Using NCBI name, which should match accession number passed
            organism_names[record.name] = record.annotations['organism']

    return organism_names
Example #53
    def FastaToFDB(self, fastafile, username):
        fdb_registers = []
        content = open(fastafile, "r")

        sequences = parse(content, 'fasta')

        for sequence in sequences:
            fdb_register = FDBRegister()
            fdb_register.description = sequence.description
            fdb_register.gene = str(sequence.seq)
            fdb_register.geneinfo = sequence.annotations
            fdb_register.filename = fastafile
            fdb_register.date = date.today()
            fdb_register.user = username

            fdb_registers.append(fdb_register)

        content.close()

        return self.mount_fdb_file(fdb_registers)
Example #54
def main():
    args = parse_args(sys.argv)
    logging.basicConfig(level=args.log_level)
    logger.debug(args)

    if args.match_order and args.excluding:
        raise ValueError("--match-order and --excluding cannot both be set.")

    to_fetch = [line.strip() for line in args.list_handle]
    fetch_set = set(to_fetch)

    rec_iter = parse(args.in_handle, args.fmt_infile)

    if args.excluding:
        out_iter = exclude_iter(rec_iter, fetch_set)
    elif args.match_order:
        out_iter = order_iter(fetch_iter(rec_iter, fetch_set), to_fetch)
    else:
        out_iter = fetch_iter(rec_iter, fetch_set)

    write(out_iter, sys.stdout, args.fmt_outfile)
Example #55
def main():
    args = parse_args(sys.argv)
    logging.basicConfig(level=args.log_level)
    logger.debug(args)

    hits = read_table(args.table_handle)
    hits['mis_sum'] = hits.mis_start + hits.mis_stop
    if args.max_mismatch:
        hits = hits[hits.mis_sum <= args.max_mismatch]
    if args.primer_set:
        hits = hits[hits.primer_set == args.primer_set]

    recs = parse(args.in_handle, args.fmt_infile)
    for rec in recs:
        amplicon, hit_info = get_amplicon(rec, hits,
                                          trim_primers=args.trim_primers)
        logger.debug(hit_info)
        if hit_info is None and args.drop:
            warn(cli.DropSequenceWarning("No hit found for {rec.id}".format(rec=rec)))
        else:
            write(amplicon, args.out_handle, args.fmt_outfile)
Example #56
def process_seqfile(args, paths):
	""" Sample high quality reads from seqfile """
	if args['verbose']:
		print ("====Estimating Average Genome Size====")
		print ("Sampling & trimming reads...")
	outfile = open(paths['tempfile'], 'w')
	# loop over sequences
	read_id, dups, too_short, low_qual = 0, 0, 0, 0
	seqs = set([])
	for seqfile in args['seqfiles']:
		i = 0
		try:
			seq_iterator = parse(open_file(seqfile), args['fastq_format'] if args['file_type'] == 'fastq' else 'fasta')
			for rec in seq_iterator:
				i += 1
				# record sequence if enough high quality bases remain
				if len(rec.seq) < args['read_length']:
					too_short += 1; continue
				# check if sequence is a duplicate
				elif args['filter_dups'] and (str(rec.seq) in seqs or str(rec.seq.reverse_complement()) in seqs):
					dups += 1; continue
				# check if sequence is low quality
				elif quality_filter(rec, args):
					low_qual += 1; continue
				# keep seq
				else:
					outfile.write('>'+str(read_id)+'\n'+str(rec.seq[0:args['read_length']])+'\n')
					read_id += 1
					if args['filter_dups']: seqs.add(str(rec.seq))
					if read_id == args['nreads']: break
			if read_id == args['nreads']: break
		except Exception:
			error = "\nAn error was encountered when parsing sequence #%s in the input file: %s\n" % (i+1, seqfile)
			error += "Make sure that the sequence and quality headers match for each sequence (- the 1st character)\n"
			error += "See: https://en.wikipedia.org/wiki/FASTQ_format"
			clean_up(paths)
			sys.exit(error)
Example #57
from scipy.spatial.distance import squareform
from functions import *
from scoring import *
from Bio.SeqIO import parse
import numpy as np  # np.array is used below

handle = open("unknown-proteobacteriae-pubmed-8422969.fasta", "r")
sequences = [str(x.seq) for x in parse(handle, "fasta")]  # a list, so it can be reused below

# copy the sequences array
seqs_tmp = [seq for seq in sequences]
similarity_vector = []

# get all pairs of sequences and get the distances
while len(seqs_tmp) > 0:
    v1 = seqs_tmp.pop(0)
    for v2 in seqs_tmp:
        a1, a2 = pairwise_alignment(v1, v2, -2, exact_match)
        similarity_vector.append(pairwise_distance(a1, a2))

distance_matrix = 1 - np.array(similarity_vector)
distance_matrix = squareform(distance_matrix)
# print(np.round(distance_matrix, decimals=2))
t = build_guiding_tree(distance_matrix, sequences)
# print(t)

msa_wrapper(t, exact_match)
msa_wrapper(t, average_match)
Example #58
from Bio.SeqIO import parse
from Bio import pairwise2
from Bio.Align.Applications import MuscleCommandline

# all CDS of mercurialis 
# /scratch/cluster/monthly/gcossard/Hydrexpr_Kallisto/Hydrexpr_reads/CDS_listofbams2.txt.fas

# /scratch/cluster/monthly/gcossard/SNPcall_EXPR/AllCDSMannua_v_AllCDSRicinus.txt
# BLAST output: /scratch/cluster/monthly/gcossard/SNPcall_EXPR/ALLCDS_v14.fasta against /scratch/cluster/monthly/gcossard/Ricinus_data/TIGR_castorWGS_release_0.1.cds.fsa

blastFile = "/scratch/cluster/monthly/gcossard/SNPcall_EXPR/AllCDSMannua_v_AllCDSRicinus.txt"
ricinusFile = "/scratch/cluster/monthly/gcossard/Ricinus_data/TIGR_castorWGS_release_0.1.cds.fsa"
mercuFile = "/scratch/cluster/monthly/croux/guillaume/ALLCDS_v14.fasta"

ricinus = {}
infile = parse(ricinusFile, "fasta")
for i in infile:
	ricinus[i.id] = i.seq
infile.close()

mercu = {}
infile = parse(mercuFile, "fasta")
for i in infile:
	gene = i.id
	if gene not in mercu:
		mercu[gene] = {}
	mercu[gene] = i.seq
infile.close()

blast = {}
infile = open(blastFile, "r")
Example #59
#!/software/bin/python2.7
from Bio.SeqIO import parse

infile = "/scratch/cluster/monthly/croux/mercurialis/Bams/reads2snps/diploids/orf_fastas/consensus_annua_orf_geneCapture.fas"

input = parse(infile, "fasta")

cnt1 = 0
cnt2 = 0
res = ""
for i in records:
	cnt1 += 1
	res += ">{0}\n{1}\n".format(i.id, i.seq)
	if cnt1%100 == 0:
		cnt2 += 1
		output = open("input_blast_{0}.fas".format(str(cnt2)), "w")
		output.write(res)
		output.close()
		res = ""
		cnt1 = 0
cnt2 += 1
output = open("input_blast_{0}.fas".format(str(cnt2)), "w")
output.write(res)
output.close()

Example #60
def reader(foo):
    '''
    generator yielding Bio.Seq.Seq objects from a FASTA file
    '''
    for record in parse(open(foo, 'r'), 'fasta'):
        yield record.seq