Esempio n. 1
0
def standardCassette(PromoterName, TerminatorName, orfName, orfSeq):
    """Collect the three parts of a promoter / ORF / terminator cassette.

    The promoter is whatever ``fetchNeighbor`` returns for 600 nt
    "upstream" of the gene ``PromoterName``; the terminator is what it
    returns for 250 nt "downstream" of the gene ``TerminatorName``.  The
    ORF is wrapped in a new record without modification.

    :param PromoterName: gene name handed to ``fetchGene`` for the promoter.
    :param TerminatorName: gene name handed to ``fetchGene`` for the
        terminator.
    :param orfName: id given to the ORF record.
    :param orfSeq: ORF sequence as a string.
    :return: tuple ``(PromoterRec, orfRecord, TerminatorRec)``.
    """
    # The prompts below are leftovers from an interactive version; they are
    # still printed, in the same order, so console output is unchanged.

    #first, the promoter
    print(
        "I'm going to build a standard cassette in which promoter is 600nt, terminator 250nt."
    )
    print("First, which PROMOTER do you want to use, e.g., TDH3")

    PromoterRec = fetchNeighbor(fetchGene(PromoterName), "upstream", 600)
    PromoterRec.id = PromoterRec.id + "ps"  # tag the id as a promoter

    #second, the terminator
    print("Which TERMINATOR do you want to use, e.g., ADH1")
    TerminatorRec = fetchNeighbor(fetchGene(TerminatorName), "downstream",
                                  250)
    TerminatorRec.id = TerminatorRec.id + "ts"  # tag the id as a terminator

    #and last, the gene
    print("What is the name of your gene, e.g., KlGapDH")

    print("What's the sequence")

    orfRecord = SeqRecord(Seq(orfSeq, SingleLetterAlphabet()), id=orfName)

    return PromoterRec, orfRecord, TerminatorRec
Esempio n. 2
0
def editExisting(name,
                 option,
                 promoter=None,
                 terminator=None,
                 NewGeneName="",
                 NewGeneSeq=""):
    """Build a stitched construct that edits the locus of gene ``name``.

    The construct is always flanked by ``HomologyLength`` nt taken upstream
    and downstream of the original gene (via ``fetchNeighbor``).

    :param name: gene name handed to ``fetchGene``.
    :param option: selects what goes between the homology arms:
        1 - nothing (the two arms are joined directly);
        2 - a new sequence ``NewGeneSeq`` with id ``NewGeneName``;
        3 - a promoter/ORF/terminator cassette from ``standardCassette``;
        4, 5 - reserved, not implemented.
    :param promoter: promoter gene name (option 3 only).
    :param terminator: terminator gene name (option 3 only).
    :param NewGeneName: id for the inserted record (options 2 and 3).
    :param NewGeneSeq: sequence string to insert (options 2 and 3).
    :return: whatever ``stitch`` returns for the fragment list.
    :raises NotImplementedError: for the reserved options 4 and 5.
    :raises ValueError: for any other unknown option.
    """
    OrigGeneRecord = fetchGene(name)
    UpHomRec = fetchNeighbor(OrigGeneRecord, "upstream", HomologyLength)
    DownHomRec = fetchNeighbor(OrigGeneRecord, "downstream", HomologyLength)

    if option == 1:
        fragments = [UpHomRec, DownHomRec]
    elif option == 2:
        InsertRec = SeqRecord(Seq(NewGeneSeq, SingleLetterAlphabet()),
                              id=NewGeneName)
        fragments = [UpHomRec, InsertRec, DownHomRec]
    elif option == 3:
        PromoterRec, orfRecord, TerminatorRec = standardCassette(
            promoter, terminator, NewGeneName, NewGeneSeq)
        fragments = [
            UpHomRec, PromoterRec, orfRecord, TerminatorRec, DownHomRec
        ]
    elif option in (4, 5):
        # These branches used to fall through and fail with an
        # UnboundLocalError on ``fragments``; say what is actually wrong.
        raise NotImplementedError(
            "editExisting option %r is not implemented" % (option,))
    else:
        raise ValueError(
            "editExisting option must be 1, 2 or 3, got %r" % (option,))

    return stitch(fragments)
Esempio n. 3
0
def create_new_sequence_record(seq_record):
    """Return a shortened, quality-checked copy of ``seq_record``.

    The new id comes from ``mod_read_id`` and the new sequence from
    ``reduce_seq_length``.  ``None`` is returned when ``reduce_seq_length``
    yields ``None`` (a warning is printed) or when any of the first 30 PHRED
    scores is below 30.  Otherwise a new record is built that reuses the
    original's name, description, dbxrefs, features and annotations and
    carries only the first 30 PHRED scores.
    """
    replacement_id = mod_read_id(seq_record)
    trimmed = reduce_seq_length(seq_record)
    if trimmed is None:
        print("Warning: Sequence less than 30 characters")
        return None

    # Only the leading 30 quality values are kept and inspected.
    leading_scores = list(seq_record.letter_annotations["phred_quality"])[:30]
    if any(quality < 30 for quality in leading_scores):
        return None

    rebuilt = SeqRecord(
        Seq(str(trimmed), SingleLetterAlphabet()),
        id=replacement_id,
        name=seq_record.name,
        description=seq_record.description,
        dbxrefs=seq_record.dbxrefs,
        features=seq_record.features,
        annotations=seq_record.annotations,
    )
    rebuilt.letter_annotations["phred_quality"] = leading_scores
    return rebuilt
Esempio n. 4
0
 def _clean_DNA_seq(self, record):
     '''
     Upper-case the record's sequence and mask every symbol that is not
     G, A, T or C with N (this covers X as well as any other character).
     :param record: Biopython SeqRecord object
     :return: Biopython Seq object containing only G, A, T, C and N
     '''
     upper_cased = str(record.seq).upper()
     masked = re.sub('[^GATC]', 'N', upper_cased)
     return Seq.Seq(masked, SingleLetterAlphabet())
def make_fpa_fasta(genenamestr, genenamelist):
    """Split alleles by their functionality tag and write them as FASTA.

    Records whose description contains an F/ORF tag go to ``FUNC``; records
    whose description contains a P tag go to ``PSDO``.  When ``genenamestr``
    contains "TRBC" the ``FUNC`` records are regrouped per allele and their
    exons joined into one record each.  Three files are then written:
    ``extdata/<genenamestr>_F.fasta``, ``extdata/<genenamestr>_P.fasta`` and
    ``extdata/<genenamestr>.fasta``.  Nothing is written when
    ``genenamelist`` is empty/falsy.

    :param genenamestr: gene family label used in the output file names.
    :param genenamelist: list of SeqRecord-like objects (``id``, ``name``,
        ``description``, ``seq`` and ``dbxrefs`` are read).
    :return: None.

    NOTE(review): uses ``xrange`` and indexes ``dict.values()``, which only
    works on Python 2.
    """
    PSDO = []
    FUNC = []
    if genenamelist:
        for allele in genenamelist:
            # The two tests are independent, so a description carrying both
            # kinds of tag lands in both lists.
            if any(c in allele.description
                   for c in ('|F|', '|(F)|', '|[F]|', '|ORF|', '|(ORF)|',
                             '|[ORF]|')):
                FUNC.append(allele)
            if any(c in allele.description for c in ('|P|', '|(P)|', '|[P]|')):
                PSDO.append(allele)
        if "TRBC" in genenamestr:
            # Group exon records by the id text that follows the first "|";
            # rdict/nth_occur are defined elsewhere - presumably rdict appends
            # the value to a list stored under the key (verify).
            cdict = dict()
            for exons in FUNC:
                all_name = exons.id[exons.id.find("|") +
                                    1:nth_occur(exons.id, "|", 2)]
                rdict(all_name, exons, cdict)
            FUNC = []
            # NOTE(review): exn_order is created once and never cleared, so
            # entries from earlier alleles are still present when later
            # alleles are joined - confirm this is intended.
            exn_order = dict()
            for n in xrange(len(cdict.values())):
                for seqs in cdict.values(
                )[n]:  #arrange exons according to their order
                    # The single character just before the position returned
                    # by nth_occur(description, "|", 5) is used as the exon
                    # number.
                    i = int(
                        seqs.description[nth_occur(seqs.description, "|", 5) -
                                         1])
                    exn_order[i] = seqs
                    #make joined sequence and new description
                    # (rebuilt from scratch on every exon; only the values
                    # from the last iteration are used below)
                    ntseq = []
                    ntpos = []
                    ntnt = []
                    for j in sorted(exn_order.keys()):
                        ntseq.append(str(exn_order[j].seq))
                        ntpos.append(exn_order[j].description[
                            nth_occur(exn_order[j].description, "|", 5) +
                            1:nth_occur(exn_order[j].description, "|", 6)])
                        ntnt.append(exn_order[j].description[
                            nth_occur(exn_order[j].description, "|", 6) +
                            1:nth_occur(exn_order[j].description, "|", 7) - 2])

                    # The first character of the joined sequence is dropped.
                    seq = "".join(ntseq)[1:]
                    position = ";".join(ntpos)
                    region = "C-REGION"
                    nt = "+".join(ntnt).replace(" ", "")

                # One joined record per allele; id/name/dbxrefs and the
                # leading part of the description come from the group's first
                # exon record.
                fle = SeqRecord(
                    Seq(seq, SingleLetterAlphabet()),
                    id=cdict.values()[n][0].id,
                    name=cdict.values()[n][0].name,
                    description="|".join([
                        cdict.values()[n][0].description[:nth_occur(
                            cdict.values()[n][0].description, "|", 4)], region,
                        position, nt, " | | | | | | | |"
                    ]),
                    dbxrefs=cdict.values()[n][0].dbxrefs)
                FUNC.append(fle)

        SeqIO.write(FUNC, "extdata/%s_F.fasta" % genenamestr, "fasta")
        SeqIO.write(PSDO, "extdata/%s_P.fasta" % genenamestr, "fasta")
        SeqIO.write(genenamelist, "extdata/%s.fasta" % genenamestr, "fasta")
Esempio n. 6
0
def variableCassette(geneList, seqList, toVary="", variants=[], variantSeq=[]):
    """Stitch a base cassette plus one partial construct per variant.

    Each gene name in ``geneList`` is paired by position with a sequence in
    ``seqList`` and wrapped in a SeqRecord whose id is its 1-based position
    and whose name is the gene name.  For every entry of ``variants`` (paired
    by position with ``variantSeq``) a deep copy of the base list is made
    with element ``toVary`` (1-based) replaced by the variant record.

    :param toVary: 1-based position to replace; "" means "not given".
    :param variants: names of the replacement parts (never mutated here).
    :param variantSeq: sequences of the replacement parts (never mutated).
    :return: list of one-element lists: first the stitched full base
        cassette, then for each variant the stitch of the slice
        ``[toVary - 2:toVary]`` of its fragment list.
    """
    if toVary != "":
        toVary = int(toVary)

    def _as_record(label, position, pool):
        # id is the 1-based position within the list being built
        rec = SeqRecord(Seq(pool[position], SingleLetterAlphabet()),
                        id=str(position + 1))
        rec.name = label
        return rec

    records = [
        _as_record(gene, idx, seqList) for idx, gene in enumerate(geneList)
    ]
    variantRecords = [records]

    # Executes if variants is not empty
    if variants != []:
        for idx, variant in enumerate(variants):
            replacement = _as_record(variant, idx, variantSeq)
            # Work on deep copies so the base records are never shared
            # between constructs.
            swapped = copy.deepcopy(records)
            swapped[toVary - 1] = replacement
            variantRecords.append(copy.deepcopy(swapped))

    # Returns a list of lists of the answers.
    answer = [[stitch(variantRecords[0])]]
    for altered in variantRecords[1:]:
        answer.append([stitch(altered[toVary - 2:toVary])])

    return answer
Esempio n. 7
0
def concatenate_fasta(args):
    """Join all records of each FASTA file matched by ``args.input``.

    For every matching file the records are added, in file order, onto an
    empty sequence; the file name is printed, used as the id of the result
    (with an empty description), and the result is written in FASTA format
    to ``args.output``.

    NOTE(review): every matched file is written to the same ``args.output``;
    if that is a path rather than an open handle, later files presumably
    replace earlier ones - confirm against the caller.
    """
    for path in glob.glob(args.input):
        merged = Seq.Seq("", SingleLetterAlphabet())
        for entry in SeqIO.parse(path, 'fasta'):
            merged += entry
        print(path)
        merged.id = path
        merged.description = ""
        SeqIO.write(merged, args.output, 'fasta')
Esempio n. 8
0
def _cleanAli2(recordNuc, omit, fileName, stage):
    """Thread nucleotide codons back onto the alignment in 'tAligned.fas'.

    Each aligned record read from 'tAligned.fas' is matched to the
    nucleotide record(s) in ``recordNuc`` whose id occurs inside the aligned
    record's id; the first match is cut into pieces by ``_spliter(seq, 3)``.
    Walking the aligned sequence, '-' becomes '---', 'Z' becomes 'NNN'
    (consuming one piece) and any other symbol is replaced by the next
    piece.  The rebuilt records are written to ``fileName`` as FASTA and the
    temporary files 'translated.fas' and 'tAligned.fas' are deleted.

    :param recordNuc: iterable of nucleotide records (``id`` and ``seq``
        are read); it is scanned once per aligned record.
    :param omit: unused.
    :param fileName: path of the FASTA file to write.
    :param stage: when equal to "mapper", leading N's of the first piece and
        trailing N's of the last piece are stripped.
    :raises IndexError: if an aligned record has no matching nucleotide
        record, or has more residues than there are pieces.
    """
    # Read everything up front and close the handle immediately: the file is
    # deleted at the end of this function, which must not happen while it is
    # still open.
    with open('tAligned.fas', 'rU') as handleP:
        records = list(SeqIO.parse(handleP, 'fasta'))

    for rec in records:
        nucData = [x.seq for x in recordNuc if x.id in rec.id]
        nucSeqData = _spliter(nucData[0], 3)

        if stage == "mapper":
            nucSeqData[0] = nucSeqData[0].lstrip("N")
            nucSeqData[-1] = nucSeqData[-1].rstrip("N")

        sequence = Seq("", SingleLetterAlphabet())
        pos = 0  # index of the next unused piece

        for amino in rec.seq:
            if amino == '-':
                # alignment gap: no piece is consumed
                sequence = sequence + Seq("---", SingleLetterAlphabet())
            elif amino == "Z":
                sequence = sequence + Seq("NNN", SingleLetterAlphabet())
                pos = pos + 1
            else:
                sequence = sequence + nucSeqData[pos]
                pos = pos + 1

        rec.seq = Seq(str(sequence), SingleLetterAlphabet())

    with open(fileName, 'w') as fp:
        SeqIO.write(records, fp, "fasta")

    os.remove('translated.fas')
    os.remove('tAligned.fas')
Esempio n. 9
0
def record_from_indices(record, indices):
    '''Build a new record from the letters of ``record`` found at the given
    integer positions, in the order given.

    The new record uses ``record.id`` as its id, name and description.  If
    the source record has any per-letter annotations, its 'phred_quality'
    values at the same positions are carried over.'''
    picked_letters = ''.join(record[i] for i in indices)
    subset = SeqRecord(Seq(picked_letters, SingleLetterAlphabet()), record.id)
    subset.description = record.id
    subset.name = record.id
    if record.letter_annotations:
        qualities = record.letter_annotations['phred_quality']
        subset.letter_annotations['phred_quality'] = [
            qualities[i] for i in indices
        ]
    return subset
Esempio n. 10
0
def convert_a2m(ali):
    """Strip lower-case letters and dots from every row of an alignment.

    ``ali`` is FASTA-formatted alignment text.  Every character matching
    ``[a-z.]`` is removed from each sequence, and the result is returned as
    FASTA text built from a ``MultipleSeqAlignment`` of the edited records.
    """
    stream = cStringIO.StringIO(ali)
    alignment = AlignIO.read(stream, 'fasta')
    stream.close()

    stripped = []
    for entry in alignment:
        kept_columns = re.sub(r'[a-z.]', '', str(entry.seq))
        # the parsed record is edited in place and reused
        entry.seq = Seq(kept_columns, SingleLetterAlphabet())
        stripped.append(entry)

    return MultipleSeqAlignment(stripped).format('fasta')
Esempio n. 11
0
def add_sequences(input_df, seqrecords):
    """
    Append one SeqRecord per row of ``input_df`` to ``seqrecords`` (in place).

    Each record's id and name are "<Species>:<GeneID>"; its description adds
    the exon id, three "na" placeholders, and the exon start, end and strand.
    The row attributes read are Species, GeneID, ExonID, NucleotideSequence,
    ExonRegionStart, ExonRegionEnd and Strand.  Returns None.
    """
    for row in input_df.itertuples():
        label = '{}:{}'.format(row.Species, row.GeneID)
        nucleotides = Seq(row.NucleotideSequence, SingleLetterAlphabet())
        header = '{}:{} {} na:na:na:{}:{}:{}'.format(
            row.Species, row.GeneID, row.ExonID, row.ExonRegionStart,
            row.ExonRegionEnd, row.Strand)
        seqrecords.append(
            SeqRecord(nucleotides, id=label, name=label, description=header))
Esempio n. 12
0
def _cleanAli(recordNuc, omit, fileName):
    """Thread nucleotide codons back onto the alignment in 'tAligned.fas'.

    Each aligned record read from 'tAligned.fas' is matched to the
    nucleotide record(s) in ``recordNuc`` whose id occurs inside the aligned
    record's id; the first match is cut into pieces by ``_spliter(seq, 3)``.
    Walking the aligned sequence, '-' becomes '---', 'Z' becomes 'NNN'
    (consuming one piece) and any other symbol is replaced by the next
    piece; the first and last pieces, and the finished sequence, have
    leading/trailing N's stripped.  All sequences are then cut to the length
    returned by ``manage_seqLength``, written to ``fileName`` as FASTA, and
    the temporary files 'translated.fas' and 'tAligned.fas' are deleted.

    :param recordNuc: iterable of nucleotide records (``id`` and ``seq``
        are read); it is scanned once per aligned record.
    :param omit: unused.
    :param fileName: path of the FASTA file to write.
    :raises IndexError: if an aligned record has no matching nucleotide
        record, or has more residues than there are pieces.
    """
    # Read everything up front and close the handle immediately: the file is
    # deleted at the end of this function, which must not happen while it is
    # still open.
    with open('tAligned.fas', 'rU') as handleP:
        records = list(SeqIO.parse(handleP, 'fasta'))

    for rec in records:
        nucData = [x.seq for x in recordNuc if x.id in rec.id]
        nucSeqData = _spliter(nucData[0], 3)
        last = len(nucSeqData) - 1
        sequence = Seq("", SingleLetterAlphabet())
        pos = 0  # index of the next unused piece

        for amino in rec.seq:
            if amino == '-':
                # alignment gap: no piece is consumed
                sequence = sequence + Seq("---", SingleLetterAlphabet())
            elif amino == "Z":
                sequence = sequence + Seq("NNN", SingleLetterAlphabet())
                pos = pos + 1
            else:
                if pos == 0 or pos == last:
                    sequence = sequence + nucSeqData[pos].strip("N")
                else:
                    sequence = sequence + nucSeqData[pos]
                pos = pos + 1

        rec.seq = Seq(str(sequence).strip("N"), SingleLetterAlphabet())

    optimal_length = manage_seqLength([len(rec.seq) for rec in records])

    for rec in records:
        rec.seq = rec.seq[:optimal_length]

    with open(fileName, 'w') as fp:
        SeqIO.write(records, fp, "fasta")

    os.remove('translated.fas')
    os.remove('tAligned.fas')
Esempio n. 13
0
def editEmpty(name, sequence, cutname, promoter=None, terminator=None):
    """Build a stitched construct for insertion at a catalogued cut site.

    The cut site ``cutname`` is looked up in ``cutsites.xlsx`` (columns
    'name', 'exp. lev.', 'chrom. loc.' and 'sequence'), the matching
    chromosome FASTA is loaded from ``<PROJECT_ROOT>/chromosomes`` and the
    cut sequence is located on it (the reverse complement is tried if the
    forward strand has no hit).  ``HomologyLength`` nt on either side of the
    site become the homology arms.

    :param name: id given to the inserted ORF record.
    :param sequence: ORF sequence as a string.
    :param cutname: value of the 'name' column identifying the cut site.
    :param promoter: optional gene name; when given, 600 nt "upstream" of it
        (as returned by ``fetchNeighbor``) are placed before the ORF.
    :param terminator: gene name whose 250 nt "downstream" region is placed
        after the ORF; required whenever ``promoter`` is given.
    :return: whatever ``stitch`` returns for the fragment list.
    :raises ValueError: if the cut sequence is on neither strand, or if a
        promoter is given without a terminator.
    """
    df = pd.read_excel(os.path.join(PROJECT_ROOT, "cutsites.xlsx"))

    labels = df['name'].values
    ChrLetters = df['chrom. loc.'].values
    ExpValues = df['exp. lev.'].values
    cutSeqs = df['sequence'].values

    # Re-index the table by cut-site name so rows can be looked up by label.
    cutArray = {
        'name': Series(labels, index=labels),
        'exp. lev.': Series(ExpValues, index=labels),
        'chrom. loc.': Series(ChrLetters, index=labels),
        'sequence': Series(cutSeqs, index=labels)
    }

    cutFrame = DataFrame(cutArray)
    location = cutFrame.loc[cutname, 'chrom. loc.'] + ".fasta"
    cutSequence = cutFrame.loc[cutname, 'sequence']

    # Join the path components separately; a literal backslash only works
    # as a separator on Windows.
    ChromosomeSeq = SeqIO.read(
        os.path.join(PROJECT_ROOT, "chromosomes", location), "fasta").seq

    StartIndex = ChromosomeSeq.find(cutSequence)
    if StartIndex == -1:
        ChromosomeSeq = ChromosomeSeq.reverse_complement()
        StartIndex = ChromosomeSeq.find(cutSequence)
    if StartIndex == -1:
        # Previously -1 was used as an index and produced bogus arms.
        raise ValueError(
            "cut site %r was not found on either strand of %s" %
            (cutname, location))
    # Fixed 34 nt footprint after the start of the hit.
    # NOTE(review): presumably the length of every catalogued cut sequence -
    # confirm against cutsites.xlsx.
    EndIndex = StartIndex + 34

    # Clamp at 0 so a site close to the chromosome start does not turn into
    # a negative (wrap-around) slice index.
    UpSeq = ChromosomeSeq[max(StartIndex - HomologyLength, 0):StartIndex]
    DownSeq = ChromosomeSeq[EndIndex:EndIndex + HomologyLength]

    UpHomRec = SeqRecord(UpSeq, id=cutname)
    DownHomRec = SeqRecord(DownSeq, id=cutname)

    orfRecord = SeqRecord(Seq(sequence, SingleLetterAlphabet()), id=name)
    if promoter is None:
        fragments = [UpHomRec, orfRecord, DownHomRec]
    else:
        if terminator is None:
            raise ValueError(
                "a terminator must be given together with a promoter")

        PromoterGeneRec = fetchGene(promoter)
        PromoterRec = fetchNeighbor(PromoterGeneRec, "upstream", 600)
        PromoterRec.id = PromoterRec.id + "ps"

        # The terminator comes from the *terminator* gene's downstream
        # region (same convention as standardCassette); it used to be a
        # second copy of the promoter region.
        TerminatorGeneRec = fetchGene(terminator)
        TerminatorRec = fetchNeighbor(TerminatorGeneRec, "downstream", 250)
        TerminatorRec.id = TerminatorRec.id + "ts"

        fragments = [
            UpHomRec, PromoterRec, orfRecord, TerminatorRec, DownHomRec
        ]
    return stitch(fragments)
Esempio n. 14
0
    def test_genemap_fasta_gene1(self):
        """make_genemap on the gene1 fixture groups records under 'gene1'.

        Only the first record of each expected group is compared with the
        result; the second expected record is built but not checked.
        """
        def make_record(letters, label):
            # id, name and description all carry the same label
            return SeqRecord(seq=Seq(letters, SingleLetterAlphabet()),
                             id=label,
                             name=label,
                             description=label,
                             dbxrefs=[])

        # BioSeq.Seq objects don't have a proper __eq__ comparison
        # implemented, so the relevant fields are compared one by one.
        def seqs_equal(seq1, seq2):
            left = (str(seq1.seq), seq1.id, seq1.name, seq1.description)
            right = (str(seq2.seq), seq2.id, seq2.name, seq2.description)
            return left == right

        expected = {
            'gene1': [
                make_record('AAAA', 'sample1__Brandomstuff__gene1__1'),
                make_record('CCCC', 'sample2__Arandomstuff__gene1__1'),
            ]
        }
        result = dict(sequence_split.make_genemap(self.gene1, '__', 2,
                                                  'fasta'))

        for key, wanted in expected.items():
            self.assertTrue(key in result.keys())
            self.assertTrue(seqs_equal(wanted[0], result[key][0]))
Esempio n. 15
0
def capitalize_seqs(input_fasta, output_fasta, filetype='fasta'):
    """Capitalizes the ATGC sequence in a fasta file and writes it to a new file.

    :param input_fasta: Filepath to the input fasta file to capitalize.
    :param output_fasta: Filepath to the output fasta file.
    :param filetype: The file format to read and write.  Either 'fasta' or 'fastq'
    :return: Filepath to the output fasta file.
    """
    capitalized_output_file = BufferedSeqWriter(output_fasta, filetype)

    # Parse with the requested format (it used to be hard-coded to "fasta",
    # so 'fastq' input was read with the wrong parser) and make sure the
    # input handle is closed when done.
    with open(input_fasta, 'rU') as handle:
        for sequence in SeqIO.parse(handle, filetype):
            sequence.seq = Seq(str(sequence.seq).upper(),
                               SingleLetterAlphabet())
            capitalized_output_file.write(sequence)
    capitalized_output_file.flush()
    return output_fasta
Esempio n. 16
0
def main():
    """Convert each input FASTA alignment to Stockholm and write it out.

    Arguments come from ``cli``.  Each file in ``args.infiles`` is read as a
    gapped FASTA alignment and formatted as Stockholm; a ``#=GF ID`` line
    holding the input file's base name (without extension) is inserted
    right after the first line of the Stockholm text, and the result is
    written to ``args.outfile``.
    """
    args = cli(sys.argv[0], sys.argv[1:])

    for infile in args.infiles:
        msa = AlignIO.read(infile,
                           format="fasta",
                           alphabet=Gapped(SingleLetterAlphabet(), "-"))

        stem, _extension = os.path.splitext(infile.name)
        id_ = os.path.basename(stem)

        # pieces[0] is the first line of the Stockholm text, pieces[1] the
        # remainder.
        pieces = msa.format("stockholm").split("\n", maxsplit=1)
        args.outfile.write(pieces[0])
        args.outfile.write(f"\n#=GF ID {id_}\n")
        args.outfile.write(pieces[1])
Esempio n. 17
0
def clusters_alignment(file):
    """Collapse clusters to consensus sequences and realign them with MAFFT.

    ``starting_pt(file)`` supplies the full alignment and ``hawk_wrap(file)``
    the clusters (lists of row indices into that alignment).  Clusters with
    more than one member are passed to ``alignment_wrap``, then ``em_cons``
    is run on a fixed pair of files and the A/C/T/G/U characters of its
    output become a record named "CLUSTER<original cluster index>".
    Single-member clusters contribute their one sequence unchanged; empty
    clusters are skipped.  The collected records are written to
    "very_unlikely_to_be_called_this.fasta", MAFFT is run on the copy of
    that name under /home/god/Documents/oldHawkEye/, and MAFFT's output
    replaces the file's contents.

    NOTE(review): presumably ``alignment_wrap`` writes the em_cons input
    file as a side effect - its return value is not used here.
    """
    full_alignment = starting_pt(file)
    clusters = hawk_wrap(file)

    # Original positions of the multi-member clusters, in order; one is
    # consumed per consensus record built below.
    cluster_index = [
        pos for pos, members in enumerate(clusters) if len(members) > 1
    ]
    consensus_list = []
    R_DNA = ['A', 'C', 'T', 'G', 'U']

    while clusters:
        current_cluster = clusters.pop(0)
        if len(current_cluster) > 1:
            multiple_seqs = [full_alignment[x] for x in current_cluster]
            alignment_wrap(multiple_seqs)
            subprocess.run([
                "em_cons",
                "/home/god/Documents/oldHawkEye/very_unlikely_to_be_called_this.fasta",
                "/home/god/Documents/oldHawkEye/very_unlikely_consensus.cons"
            ])
            with open(
                    "/home/god/Documents/oldHawkEye/very_unlikely_consensus.cons"
            ) as consensus:
                seq = consensus.read()
            # keep only nucleotide letters from the whitespace-split output
            con_str = ''.join(letter for chunk in seq.split()
                              for letter in chunk if letter in R_DNA)
            simple_seq_r = SeqRecord(Seq(con_str, SingleLetterAlphabet()),
                                     id="CLUSTER" + str(cluster_index.pop(0)))
            consensus_list.append(simple_seq_r)
        elif len(current_cluster) == 1:
            consensus_list.append(full_alignment[current_cluster[0]])

    out_name = "very_unlikely_to_be_called_this.fasta"
    SeqIO.write(consensus_list, out_name, "fasta")
    in_file = "/home/god/Documents/oldHawkEye/" + out_name
    mafft_cline = MafftCommandline(input=in_file)
    stdout, stderr = mafft_cline()
    with open(out_name, "w") as handle:
        handle.write(stdout)
Esempio n. 18
0
def mask_seq(seq, start, end, id, length):
    '''Return the records of ``seq`` with one region of contig ``id`` masked.

    For every record whose id equals ``id`` a new record is built (same id
    and description) in which everything from position ``start`` (1-based)
    through ``end`` is replaced by ``length`` N characters.  All other
    records are passed through unchanged.'''
    filler = Seq("N" * length, SingleLetterAlphabet())

    def masked_copy(rec):
        patched = rec.seq[:start - 1] + filler + rec.seq[end:]
        return SeqRecord(id=rec.id, description=rec.description, seq=patched)

    return [masked_copy(rec) if rec.id == id else rec for rec in seq]
Esempio n. 19
0
def convert_sequence_file_format(input_filepath,
                                 input_format,
                                 output_format,
                                 output_filename=None):
    """
    Converts a sequence file in the format given by 'input_format' into a
    file in the format given by 'output_format', written next to the input.

    :param input_filepath: path of the file to convert.
    :param input_format: "pymod" (one "<id> <sequence>" pair per line) or
        any format name accepted by ``SeqIO.parse``.
    :param output_format: "pymod" or any format name accepted by
        ``SeqIO.write``; it must be a key of
        ``pymod_vars.alignment_extensions_dictionary``, which supplies the
        output file extension.
    :param output_filename: base name (without extension) of the output
        file; defaults to the input file's base name.
    :return: None.
    :raises KeyError: if ``output_format`` has no registered extension.

    NOTE(review): "pymod" output puts the id and the sequence on separate
    lines, while "pymod" input expects them on one line separated by a
    space - confirm which layout is intended.
    """
    input_file_name = os.path.splitext(os.path.basename(input_filepath))[0]
    output_file_basename = "%s.%s" % (
        output_filename or input_file_name,
        pymod_vars.alignment_extensions_dictionary[output_format])
    output_filepath = os.path.join(os.path.dirname(input_filepath),
                                   output_file_basename)

    # Read the whole input before the output is opened: the output file is
    # no longer created/truncated when reading fails, and both handles are
    # closed even if an exception is raised.
    with open(input_filepath, "r") as input_file_handler:
        if input_format == "pymod":
            records = []
            for line in input_file_handler:
                if not line.strip():
                    continue  # tolerate blank lines
                fields = line.split(" ")
                records.append(
                    SeqRecord(Seq(fields[1].rstrip("\n\r")), id=fields[0]))
        else:
            records = list(
                SeqIO.parse(input_file_handler,
                            input_format,
                            alphabet=SingleLetterAlphabet()))

    with open(output_filepath, "w") as output_file_handler:
        if output_format == "pymod":
            lines = []
            for rec in records:
                lines.append(str(rec.id) + '\n')
                lines.append(str(rec.seq) + '\n')
            output_file_handler.writelines(lines)
        else:
            SeqIO.write(records, output_file_handler, output_format)
 def _get_seqlist(self, slist, gdict):
     """Join the sequences of the intervals in ``slist`` into one sequence.

     Interval sequences (from ``interval.sequence(gdict)``) are upper-cased;
     any gap between two consecutive intervals is filled with the
     lower-cased genomic sequence from ``gdict[chrom]``.  On the '+' strand
     intervals must be ordered by increasing position; otherwise they must
     be ordered by decreasing position and the gap filler is
     reverse-complemented.  An empty ``slist`` gives an empty Seq.
     Overlapping consecutive intervals trip an assertion.
     """
     if len(slist)==0:
         return Seq('',alphabet=SingleLetterAlphabet())

     forward = self.strand() == '+'
     iseq = slist[0].sequence(gdict).upper()
     # walk consecutive (previous, current) interval pairs
     for p0, p1 in zip(slist, slist[1:]):
         if forward:
             assert not p1.start < p0.end, "error: overlapping intervals:\n%s\n%s" % (p0,p1)
             if p1.start > p0.end:
                 iseq += gdict[p1.chrom][p0.end:p1.start].lower()
         else:
             assert not p0.start < p1.end, "error: overlapping intervals:\n%s\n%s" % (p0,p1)
             if p0.start > p1.end:
                 iseq += gdict[p1.chrom][p1.end:p0.start].reverse_complement().lower()
         iseq += p1.sequence(gdict).upper()
     return iseq
def filtergen(file):
    """Yield quality-masked copies of the FASTQ reads that pass the filter.

    A base "passes" when its PHRED score is strictly greater than the
    module-level ``Qscore_threshold``.  Reads in which fewer than half of
    the bases pass are dropped.  For the remaining reads every base that
    does not pass is replaced by 'N'; id, name, description and the
    per-letter annotations are carried over unchanged.  Intended as the
    record source for writing a new FASTQ file.
    """
    for record in SeqIO.parse(file, "fastq"):
        qualities = record.letter_annotations['phred_quality']
        passes = [score > Qscore_threshold for score in qualities]

        # True counts as 1 and False as 0 when summed
        passing_fraction = float(sum(passes)) / float(len(passes))
        if passing_fraction < .5:
            continue

        masked = "".join(base if ok else 'N'
                         for ok, base in zip(passes, record.seq))
        yield SeqRecord(Seq(masked, SingleLetterAlphabet()),
                        id=record.id,
                        name=record.name,
                        description=record.description,
                        letter_annotations=record.letter_annotations)
Esempio n. 22
0
 def __extractReads(self, indexVal, howLong, iterator):
     """Collect the non-empty reads from ``iterator`` as new records.

     For each record, ``self.__getSeq(record, indexVal, howLong)`` decides
     whether there is anything to keep (a zero-length result skips the
     record).  Kept records get a tab-separated header built from the
     record id, the read's 'taxa' and 'readnum' entries in
     ``self.readInfo``, ``self.ko``, ``indexVal`` and ``howLong``; the
     sequence is upper-cased with '-' characters removed.  The new record
     is appended to ``self.outputRecords`` and the read's 'readnum'
     counter is incremented.

     Record ids must look like "<digits>-contig<digits>"; the digits
     before the dash are the key into ``self.readInfo``.  An id that does
     not match raises AttributeError (no match object), as before.
     """
     for record in iterator:
         read = self.__getSeq(record, indexVal, howLong)
         if len(read) > 0:
             # Raw string: "\d" in a normal string literal is an invalid
             # escape sequence and triggers a warning on recent Pythons.
             readID = re.match(r"^(\d+)-(contig\d+)$",
                               record.id).group(1)
             r = self.readInfo[readID]
             #>58526338-contig00001-33057/1;  KO:K00927       start: 575      offset: 287
             header = "%s-%s/%s\tKO:%s\tstart:%s\toffset:%s" % (
                 record.id, r['taxa'], r['readnum'], self.ko, indexVal,
                 howLong)
             newseq = Seq(
                 str(record.seq).upper().replace('-', ''),
                 SingleLetterAlphabet())
             newrecord = SeqRecord(newseq,
                                   id=header,
                                   name="",
                                   description="")
             self.outputRecords.append(newrecord)
             self.readInfo[readID]['readnum'] += 1
     return
def combine_sequence(in_fasta, threshold):
    """Concatenate all sufficiently long contigs of a FASTA file.

    Records are visited in sorted key order.  Contigs whose length is at
    least ``threshold`` are appended to one combined sequence, and their
    [start, end) offsets within it are recorded.

    :param in_fasta: path of the FASTA file to index.
    :param threshold: minimum contig length to include.
    :return: tuple ``(out_fasta_dic, out_map_dic)`` where ``out_fasta_dic``
        has the key "sequence" (the combined Seq) plus "id" and
        "description" copied from the first record in sorted order
        (regardless of its length; absent if the file has no records), and
        ``out_map_dic`` maps each included contig id to ``[start, end]``.
    """
    record_dict = SeqIO.index(in_fasta, "fasta")  # index the record
    combined_string = Seq("", SingleLetterAlphabet())
    out_fasta_dic = {}
    # dictionary map index of each contig
    out_map_dic = {}
    start = 0
    try:
        for count, key in enumerate(sorted(record_dict), start=1):
            # Look the record up once: every lookup on the index re-reads
            # the entry from disk.
            entry = record_dict[key]
            new_contig = entry.seq
            length = len(new_contig)
            if length >= threshold:
                combined_string += new_contig
                out_map_dic[entry.id] = [start, start + length]
                start += length  # increment the start position
            if count == 1:
                out_fasta_dic["id"] = entry.id
                out_fasta_dic["description"] = entry.description
    finally:
        # Release the file handle held by the index.
        record_dict.close()
    out_fasta_dic["sequence"] = combined_string
    return out_fasta_dic, out_map_dic
def extract_data(rdir, LVEXON):
    """Split a reference FASTA into TRB gene-family files and build L+V records.

    Steps:
      1. Make sure the directory returned by ``makerdir('/extdata')`` exists.
      2. Read ``rdir``, keep records whose id contains 'Homo' and 'TRB', and
         sort them into TRBV / TRBD / TRBJ / TRBC by id; TRBV records whose
         description contains 'L-PART1+L-PART2' go to TRBL instead.  Each
         family is written out through ``make_fpa_fasta``.
      3. Re-read the TRBV files just written and, for every allele, look for
         leader+V sequences in ``LVEXON`` whose id contains the allele name,
         whose tail contains the V sequence and which start with "atg".  Each
         hit becomes a combined record (leader + V) with the length field of
         the description rewritten as "<leader>+<V> nt".
      4. V alleles without a hit borrow a leader recorded for another allele
         of the same gene (chosen by ``most_common``).
      5. The combined records are written through ``make_fpa_fasta("TRBLV", ...)``.

    :param rdir: path of the reference FASTA file.
    :param LVEXON: path of the FASTA file with leader+V exon sequences.
    :return: None.

    NOTE(review): in the copy this was restored from, the species filter was
    mangled to ``h**o`` / ``'H**o'`` (not valid Python); ``homo`` / ``'Homo'``
    is the evident original - confirm against upstream.
    """
    #make /extdata directory
    final_directory = makerdir('/extdata')
    if not os.path.exists(final_directory):
        os.makedirs(final_directory)

    #read fasta file and narrow it down to human TRB entries
    rawdata = list(SeqIO.parse(rdir, "fasta"))
    homo = [element for element in rawdata if 'Homo' in element.id]
    TRB = [element for element in homo if 'TRB' in element.id]

    TRBV = []
    TRBD = []
    TRBJ = []
    TRBC = []
    TRBL = []

    for element in TRB:
        if 'TRBV' in element.id:
            if 'L-PART1+L-PART2' in element.description:
                TRBL.append(element)
            else:
                TRBV.append(element)
        elif 'TRBD' in element.id:
            TRBD.append(element)
        elif 'TRBJ' in element.id:
            TRBJ.append(element)
        elif 'TRBC' in element.id:
            TRBC.append(element)
    make_fpa_fasta("TRBV", TRBV)
    make_fpa_fasta("TRBD", TRBD)
    make_fpa_fasta("TRBJ", TRBJ)
    make_fpa_fasta("TRBC", TRBC)
    make_fpa_fasta("TRBL", TRBL)

    lead = list(SeqIO.parse(LVEXON, "fasta"))
    TRBV_F = list(SeqIO.parse("extdata/TRBV_F.fasta", "fasta"))
    TRBV_P = list(SeqIO.parse("extdata/TRBV_P.fasta", "fasta"))
    TRBV = TRBV_F + TRBV_P

    def _with_lengths(description, leader, vregion):
        # Replace the text between the 6th and 7th "|" (as located by
        # nth_occur) with "<leader length>+<V length> nt".
        length = "{0}+{1} nt".format(len(leader), len(vregion))
        return (description[:nth_occur(description, "|", 6) + 1] + length +
                description[nth_occur(description, "|", 7):])

    matchedV = []
    match = dict()  # gene name -> leaders seen for its alleles (via rdict)
    TRBLV = []

    for allele in TRBV:
        trb_at = allele.id.find("TRB")
        star_at = allele.id.find("*")
        gene_name = allele.id[trb_at:star_at]
        allele_name = allele.id[trb_at:star_at + 3]  # gene name + "*NN"
        vregion = str(allele.seq)
        for lvsq in lead:
            if allele_name not in lvsq.id:
                continue
            # the V sequence must sit at the very end of the L+V sequence
            if allele.seq not in lvsq.seq[-len(allele.seq):]:
                continue
            # ... and the leader must begin with a start codon
            if str(lvsq.seq)[:3] != "atg":
                continue
            leader = str(lvsq.seq[:-len(allele.seq)])
            TRBLV.append(
                SeqRecord(Seq(leader + vregion, SingleLetterAlphabet()),
                          description=_with_lengths(allele.description,
                                                    leader, vregion),
                          id=lvsq.id,
                          name=lvsq.name,
                          dbxrefs=lvsq.dbxrefs))
            rdict(gene_name, leader, match)
            matchedV.append(allele)

    # Alleles that found no leader.  Compared by object identity (what the
    # former set difference on records amounted to), keeping file order.
    matched_ids = set(id(allele) for allele in matchedV)
    mismatch = [allele for allele in TRBV if id(allele) not in matched_ids]

    #for V alleles with no matching L, use other allele's L sequence
    for allele in mismatch:
        genename = allele.id[allele.id.find("TRB"):allele.id.find("*")]
        if genename not in match:
            continue
        leader = str(most_common(match[genename]))
        vregion = str(allele.seq)
        TRBLV.append(
            SeqRecord(Seq(leader + vregion, SingleLetterAlphabet()),
                      description=_with_lengths(allele.description, leader,
                                                vregion),
                      id=allele.id,
                      name=allele.name,
                      dbxrefs=allele.dbxrefs))

    make_fpa_fasta("TRBLV", TRBLV)
Esempio n. 25
0
def basicVariantCall( pileUp, ref=None ):
    """Summarise per-site base counts of each reference in ``pileUp``.

    ``pileUp`` is iterated as ``(refName, counts, inserts)`` triples.
    ``counts`` is indexed as a 2-D array whose columns are sites and whose
    row indices are mapped to symbols through
    ``{0: 'A', 1: 'C', 2: 'G', 3: 'T', 4: '-', 5: 'N'}``.  ``inserts`` is a
    dict of dicts whose values are converted in place with ``np.asscalar``.

    :param pileUp: iterable of ``(refName, counts, inserts)``.
    :param ref: unused (the code that read it is commented out).
    :return: tuple ``(mapQC, summaryStats, variants, conSeq)``:
        ``mapQC`` - per reference, a dict with 'name', 'variant' (row
        indices with non-zero count, per site), 'cov' (the matching counts)
        and 'indel';
        ``summaryStats`` - per reference, a dict with 'avg_cov', 'std_cov',
        'num_uncov', 'contig_len' and 'site_entropy';
        ``variants`` - per reference, a dict site -> {'nuc', 'cov'} for sites
        with more than one observed symbol;
        ``conSeq`` - always an empty ``defaultdict(str)`` here, because the
        code that would fill it is commented out.

    NOTE(review): ``izip``/``xrange`` make this Python 2 only, and
    ``np.asscalar`` has been removed from recent NumPy releases.
    NOTE(review): the consensus sequences collected in ``consensusContigs``
    are built but not returned.
    """
    # This function will serve as a basic variant call method to be used in
    # our single ref. pipelines. It will remove all singleton sites (with
    # no mapping ambiguity).

    from Bio.Seq import Seq
    from Bio import SeqIO
    from Bio.Alphabet import SingleLetterAlphabet
    from itertools import izip
    from scipy.stats import iqr
    # import numpy as np

    mapQC = []
    summaryStats = []
    consensusContigs = []
    variants = []

    for refName,counts,inserts in pileUp:
        # Use the last "|"-separated field of the name, with "/" made safe.
        outName = refName.split("|")[-1].replace('/','_')
        tmpDict = defaultdict(list)
        tmpDict2 = defaultdict(int)
        
        tmpDict['name'] = outName
        # Per site: the row indices with a non-zero count, and those counts.
        tmpDict['variant'] = [np.nonzero(counts[:,n])[0].tolist() for n in xrange(counts.shape[1])]
        tmpDict['cov'] = [counts[np.nonzero(counts[:,n])[0],n].tolist() for n in xrange(counts.shape[1])]

        # Coverage statistics over sites; 'avg_cov' is the median and
        # 'std_cov' is IQR * 20/27 (presumably a robust estimate of the
        # standard deviation - verify).
        siteCov = np.sum(counts,axis=0)
        tmpDict2['avg_cov'] = np.median( siteCov )
        tmpDict2['std_cov'] = (20.0*iqr( siteCov )) / 27.0
        tmpDict2['num_uncov'] = np.sum(siteCov == 0)
        tmpDict2['contig_len'] = len(siteCov) 

        # Per-site symbol frequencies over covered sites only, then the mean
        # of the per-site entropy (.0001 keeps log() away from zero).
        frac = counts[:,siteCov>0]
        frac /= (1.*frac.sum(axis=0,keepdims=True))
        # print np.sum(frac.sum(axis=0,keepdims=True) ==0)
        S = -np.log(frac+.0001) * frac
        tmpDict2['site_entropy'] = np.mean(np.sum(S,axis=0)) 

        # serialIndel is the same object as inserts, so the caller's nested
        # dict is modified in place.
        serialIndel = inserts
        for outKey in serialIndel.keys():
            for inKey in serialIndel[outKey].keys():
                serialIndel[outKey][inKey] = np.asscalar(serialIndel[outKey][inKey])

        tmpDict['indel'] = serialIndel
        mapQC.append(tmpDict)
        summaryStats.append(tmpDict2)
        # Save consensus sequence (where it differs from reference) as well as locations where variation
        # has been detected.
        consensusSeq = []
        variableSites = defaultdict(lambda: defaultdict(list))
        alphabet = {0: 'A', 1: 'C', 2: 'G', 3: 'T', 4: '-', 5: 'N'}

        for n,seq in enumerate(tmpDict['variant']):
            if (len(seq) > 1):
                # More than one symbol seen: record them all and take the
                # one with the highest count as consensus.
                variableSites[n]['nuc'] = [alphabet[s] for s in seq]
                variableSites[n]['cov'] = tmpDict['cov'][n] # If there are more than one detected variant, store
                consensusSeq.append(alphabet[seq[np.argmax(tmpDict['cov'][n])]])
            elif (len(seq) == 0):
                # No coverage at this site.
                consensusSeq.append('-')
            else:
                consensusSeq.append(alphabet[seq[0]])
        consensusContigs.append(Seq(''.join(consensusSeq),SingleLetterAlphabet()))
        variants.append(variableSites)

    # refSeq = []
    # with open(ref,'r') as refFile:
    #     for contig in SeqIO.parse(refFile,'fasta'):
    #         refSeq.append(contig.seq)

    # Store consensus sequence (as a variant of the reference to save space)
    conSeq = defaultdict(str)
    # for C in xrange(len(refSeq)):
    #     for n,(rN,cN) in enumerate(izip(refSeq[C],consensusContigs[C])):
    #         if (cN != 'N' and rN != cN):
    #             conSeq[n] = cN

    return mapQC,summaryStats,variants,conSeq
def guess_alphabet(sequence:str):
    '''
    Guess the Biopython alphabet of a string representing a biological
    sequence.

    The decision is made in this order:

    1. An empty string, a string containing non-printable characters, or a
       string containing characters that are neither IUPAC letters nor
       punctuation/whitespace yields a generic ``SingleLetterAlphabet()``.
    2. If at least 90 % of the characters are one of ``GATCUN`` (either
       case) the sequence is treated as nucleic acid and the narrowest
       matching IUPAC DNA/RNA alphabet is returned. A sequence holding both
       ``T`` and ``U`` gets the generic ``NucleotideAlphabet()``.
    3. Otherwise the sequence is treated as protein: three-letter residue
       codes first, then the narrowest matching IUPAC protein alphabet.

    Punctuation and whitespace are tolerated by step 1 but still take part
    in the subset tests of steps 2 and 3, so a sequence containing them
    falls through to the generic nucleotide/protein alphabet.

    Returns an alphabet object from ``Bio.Alphabet``, or the plain string
    ``"three letter code"`` when the sequence consists solely of upper-case
    three-letter amino-acid codes.
    '''

    import string

    from Bio.Alphabet       import SingleLetterAlphabet
    from Bio.Alphabet       import NucleotideAlphabet
    from Bio.Alphabet       import ProteinAlphabet
    from Bio.Alphabet.IUPAC import extended_protein
    from Bio.Alphabet.IUPAC import protein
    from Bio.Alphabet.IUPAC import ambiguous_dna
    from Bio.Alphabet.IUPAC import unambiguous_dna
    from Bio.Alphabet.IUPAC import extended_dna
    from Bio.Alphabet.IUPAC import ambiguous_rna
    from Bio.Alphabet.IUPAC import unambiguous_rna

    if not sequence:
        return SingleLetterAlphabet()

    if any(c not in string.printable for c in sequence):
        return SingleLetterAlphabet()

    xp = set(extended_protein.letters)
    pr = set(protein.letters)

    ad = set(ambiguous_dna.letters)
    ud = set(unambiguous_dna.letters)
    ed = set(extended_dna.letters)

    ar = set(ambiguous_rna.letters)
    ur = set(unambiguous_rna.letters)

    # Union of every letter any supported alphabet knows about.
    known_letters = xp|pr|ad|ud|ed|ar|ur
    ignorable = set(string.punctuation+string.whitespace)

    sequence_chars = set(sequence.upper())

    # Anything that is neither a known letter nor punctuation/whitespace
    # (digits, for instance) rules out every specific alphabet.
    if sequence_chars - known_letters - ignorable:
        return SingleLetterAlphabet()

    nucleic_count = sum(sequence.count(letter) for letter in "GATCUNgatcun")

    if float(nucleic_count) / float(len(sequence)) >= 0.9: # DNA or RNA
        # Each test asks "is every character inside this alphabet?",
        # going from the narrowest alphabet to the widest.
        if 'T' in sequence_chars and 'U' in sequence_chars:
            alphabet = NucleotideAlphabet()
        elif not sequence_chars-ud:
            alphabet = unambiguous_dna
        elif not sequence_chars-ad:
            alphabet = ambiguous_dna
        elif not sequence_chars-ed:
            alphabet = extended_dna
        elif not sequence_chars-ur:
            alphabet = unambiguous_rna
        elif not sequence_chars-ar:
            alphabet = ambiguous_rna
        else:
            alphabet = NucleotideAlphabet()
    else:
        threecode = ['ALA', 'ASX', 'CYS', 'ASP','GLU', 'PHE', 'GLY', 'HIS',
                     'ILE', 'LYS', 'LEU', 'MET','ASN', 'PRO', 'GLN', 'ARG',
                     'SER', 'THR', 'VAL', 'TRP','TYR', 'GLX', 'XAA', 'TER',
                     'SEL', 'PYL', 'XLE']
        tc = set(threecode)
        # Consecutive, non-overlapping chunks of three characters; the
        # last chunk is shorter when the length is not a multiple of 3.
        three_letter_alphabet = {sequence[i:i+3]
                                 for i in range(0, len(sequence), 3)}
        if not three_letter_alphabet - tc:
            alphabet = "three letter code"
        # The two tests below used to be inverted (missing ``not``): a
        # sequence was labelled ``protein`` exactly when it contained
        # letters outside the protein alphabet, and a plain protein
        # sequence fell through to the generic ProteinAlphabet.
        elif not sequence_chars - pr:
            alphabet = protein
        elif not sequence_chars - xp:
            alphabet = extended_protein
        else:
            alphabet = ProteinAlphabet()
    return alphabet
Esempio n. 27
0
    if trna_mod_row['Organellum'] != 'cytosolic':
        continue
    key = (trna_mod_row['Amino acid'], trna_mod_row['Anticodon (Canonical)'])
    if key not in trna_mods:
        trna_mods[key] = []
    trna_mods[key].append({
        'id': trna_mod_row['Id'],
        'can': trna_mod_row['Sequence (Canonical)'],
        'nc': trna_mod_row['Sequence'],
    })

trna_seq_rows = pandas.read_excel('examples/homo_sapiens_rna/summary.xlsx',
                                  sheet_name='tRNA seqs - Gogakos et al.',
                                  header=[0, 1])
trna_seqs = []
alphabet = SingleLetterAlphabet()
for _, trna_seq_row in trna_seq_rows.iterrows():
    aa = list(trna_seq_row.items())[0][1]
    anticodon = list(trna_seq_row.items())[1][1]
    key = (aa, anticodon)
    if key not in trna_mods:
        continue

    # find most similar sequences in MODOMICS
    best_id = None
    best_can_seq = None
    best_seq = None
    best_alignment = None
    best_score = -float('inf')
    for trna_mod in trna_mods[key]:
        alignment = pairwise2.align.globalxs(
Esempio n. 28
0
def clusters_alignment(file):
    """Collapse each cluster to one record and align the results with MAFFT.

    ``starting_pt(file)`` supplies the sequence records and
    ``hawk_wrap(file)`` the clusters, each cluster being a list of indices
    into those records.

    * A cluster with several members is written to a temporary FASTA file,
      passed to the external ``em_cons`` program, and replaced by a single
      record named ``"CLUSTER_<position>: <id>|<id>|..."`` whose sequence is
      the A/C/G/T/U characters (either case) found in ``em_cons``' output.
    * A cluster with one member contributes that record unchanged.
    * An empty cluster contributes nothing.
    * If an ``int`` is met instead of a list, the function returns
      ``alignment_wrap`` applied to all records and does no per-cluster work.

    Returns the upper-cased MAFFT alignment of the collected records, read
    with ``AlignIO.read`` (or the ``alignment_wrap`` result, see above).

    Raises ``IndexError`` when ``hawk_wrap`` returns nothing and
    ``TypeError`` when a cluster entry is neither a list nor an int; the
    latter used to make the function loop forever.
    """
    full_alignment = starting_pt(file)
    clusters = hawk_wrap(file)
    if not clusters:
        # Same exception type the former ``clusters[0]`` probe produced.
        raise IndexError("hawk_wrap returned no clusters for %r" % (file,))

    nucleotide_letters = frozenset('AaCcTtGgUu')
    consensus_list = []
    for cluster_number, current_cluster in enumerate(clusters):
        if isinstance(current_cluster, int):
            return alignment_wrap(full_alignment)
        if not isinstance(current_cluster, list):
            raise TypeError("unexpected cluster entry of type %s"
                            % type(current_cluster).__name__)
        if not current_cluster:
            continue
        if len(current_cluster) == 1:
            consensus_list.append(full_alignment[current_cluster[0]])
            continue

        members = [full_alignment[x] for x in current_cluster]
        with tempfile.NamedTemporaryFile() as alignment_file, \
                tempfile.NamedTemporaryFile() as consensus_file:
            SeqIO.write(members, alignment_file.name, "fasta")
            # NOTE(review): the exit status of em_cons is ignored, so a
            # failed run yields an empty consensus rather than an error.
            subprocess.call(
                ["em_cons", alignment_file.name, consensus_file.name])
            with open(consensus_file.name) as consensus_handle:
                data = consensus_handle.read()

        # The whole file content is scanned, header line included; every
        # character that is not A/C/G/T/U is dropped.
        con_str = ''.join(ch for ch in data if ch in nucleotide_letters)
        seqid_string = '|'.join(str(member.id) for member in members)
        consensus_list.append(
            SeqRecord(Seq(con_str, SingleLetterAlphabet()),
                      id="CLUSTER_" + str(cluster_number) + ": " +
                      seqid_string))

    with tempfile.NamedTemporaryFile() as consensus_alignment:
        SeqIO.write(consensus_list, consensus_alignment.name, "fasta")
        mafft_cline = MafftCommandline(input=consensus_alignment.name)
        stdout, _ = mafft_cline()
        # Overwrite the MAFFT input with its aligned output.
        with open(consensus_alignment.name, "w") as handle:
            handle.write(stdout)
        with tempfile.NamedTemporaryFile() as clusters_file:
            records = (
                rec.upper()
                for rec in SeqIO.parse(consensus_alignment.name, "fasta"))
            SeqIO.write(records, clusters_file.name, "fasta")
            with open(clusters_file.name) as aligned_handle:
                aligned_list = AlignIO.read(aligned_handle, 'fasta')
    return aligned_list
Esempio n. 29
0
def grande_alignment(file):
    """Expand the cluster-level alignment back to one row per sequence and
    print the result in FASTA-like form.

    ``clusters_alignment(file)`` yields one aligned row per non-empty
    cluster, in cluster order. For a single-member cluster that row is
    used as is. For a multi-member cluster every member sequence (taken
    from ``starting_pt(file)``) gets a ``-`` inserted at each column where
    the cluster's aligned row has a ``-``. Rows shorter than the longest
    one are then right-padded with ``-``.

    When ``hawk_wrap(file)`` returns ints instead of lists, the result of
    ``clusters_alignment`` is printed unchanged.

    Nothing is returned; the clusters and then each row (``>id`` followed
    by the sequence) are written to stdout.

    NOTE(review): earlier revisions pushed every record through a temporary
    FASTA file before use. That round trip is gone, which is only
    equivalent if record ids contain no whitespace -- confirm against
    ``starting_pt``.
    """
    raw_seqs = starting_pt(file)
    before_segment = clusters_alignment(file)
    list_of_clusters = hawk_wrap(file)
    print("Clusters formed:")
    print(len(list_of_clusters))
    print(list_of_clusters)

    gap = '-'
    allEqualSeqs = []
    first_entry = list_of_clusters[0]
    if isinstance(first_entry, list):
        big_final_alignment = []
        position = 0  # row of before_segment matching the current cluster
        for current_cluster in list_of_clusters:
            if not current_cluster:
                # Empty clusters have no row in before_segment.
                continue
            cluster_row = before_segment[position]
            position += 1
            if len(current_cluster) == 1:
                big_final_alignment.append(cluster_row)
                continue

            # Gap columns of the aligned consensus, computed once per
            # cluster and in ascending order so that each insertion lands
            # on its alignment column.
            gap_columns = [
                column for column, symbol in enumerate(cluster_row.seq)
                if symbol == gap
            ]
            for member in current_cluster:
                member_record = raw_seqs[member]
                residues = list(str(member_record.seq))
                for column in gap_columns:
                    # list.insert appends when column is past the end.
                    residues.insert(column, gap)
                big_final_alignment.append(
                    SeqRecord(Seq(''.join(residues), SingleLetterAlphabet()),
                              id=member_record.id))

        largestSeq = max(len(record.seq) for record in big_final_alignment)
        for record in big_final_alignment:
            if len(record.seq) < largestSeq:
                padded = str(record.seq).ljust(largestSeq, gap)
                record = SeqRecord(Seq(padded, SingleLetterAlphabet()),
                                   id=record.id)
            allEqualSeqs.append(record)
    elif isinstance(first_entry, int):
        allEqualSeqs = before_segment

    for record in allEqualSeqs:
        print(">" + record.id)
        print(record.seq)
Esempio n. 30
0
def mask_fna_with_spacers(INTERMEDIATES: str,
                          FNA_FILE: str,
                          SELECT_SPACER_FASTA: str,
                          MASKED_FNA: str = 'masked.fna'):
    """Write a copy of an organism's FNA file with CRISPR arrays masked.

    Every array listed by ``get_start_end_list(SELECT_SPACER_FASTA)``,
    plus 500 bases on either side, is overwritten with ``N`` in the
    matching sequence from ``fna_to_dict(FNA_FILE)``. The masked records
    are written in FASTA format to ``INTERMEDIATES + 'masked_db/' +
    MASKED_FNA``.

    Parameters
    ----------
    INTERMEDIATES : directory prefix; it is concatenated as a plain string,
        so it needs its own trailing path separator.
    FNA_FILE : path handed to ``fna_to_dict``.
    SELECT_SPACER_FASTA : path handed to ``get_start_end_list``.
    MASKED_FNA : file name of the masked output inside ``masked_db/``.

    Returns
    -------
    (path of the masked file, dict mapping CRISPR sequence names to the
    keys of the sequence dictionary)

    Raises ``KeyError`` when an array refers to a name that has no entry
    in that mapping.
    """
    import os
    from sys import path as sys_path

    MASKED_ORGANISM_DB = INTERMEDIATES + 'masked_db/'  # directory for the masked FNA file
    # Replaces a call to the external ``mkdir`` program: portable, creates
    # missing parents, and is silent when the directory already exists.
    os.makedirs(MASKED_ORGANISM_DB, exist_ok=True)
    MASKED_FNA = MASKED_ORGANISM_DB + MASKED_FNA

    NEIGHBORING_NUCLEOTIDES = 500  # number of BP (+/-) to also mask around the array(s)

    # (start, end, name) of every array; start/end are converted with int().
    arrayStartEndList = get_start_end_list(SELECT_SPACER_FASTA)
    # Maps each header to a record exposing a ``.seq`` attribute.
    sequence_dict = fna_to_dict(FNA_FILE)

    # Key: the header with its last dot-separated field removed (remaining
    # fields are joined WITHOUT a dot); value: the full header.
    crisprName_maps_seqName = {}
    for key in sequence_dict:
        nc_name = key.split('.')
        if len(nc_name) > 1:
            crisprName_maps_seqName[''.join(nc_name[0:-1])] = key
        else:
            crisprName_maps_seqName[nc_name[0]] = key

    # Mask the CRISPR arrays in the organism's sequences.
    for arrayStartEnd in arrayStartEndList:
        start = int(arrayStartEnd[0]) - 1
        end = int(arrayStartEnd[1])
        crispr_name = arrayStartEnd[2]  # name of the sequence holding the array
        nc_id = crisprName_maps_seqName[crispr_name]
        sequence = str(sequence_dict[nc_id].seq)

        # Widen the window by the flanking region, clamped to the sequence.
        start = max(start - NEIGHBORING_NUCLEOTIDES, 0)
        end = min(end + NEIGHBORING_NUCLEOTIDES, len(sequence))

        masked = sequence[:start] + 'N' * (end - start) + sequence[end:]
        sequence_dict[nc_id].seq = Seq(masked, SingleLetterAlphabet())

    # Kept for callers that rely on this entry being on sys.path; guarded so
    # repeated calls no longer add duplicates.
    pygornism_dir = 'dependencies/PyGornism/'
    if pygornism_dir not in sys_path:
        sys_path.append(pygornism_dir)

    # Write the masked records to disk.
    with open(MASKED_FNA, 'w') as handle:
        for seq_record in sequence_dict.values():
            SeqIO.write(seq_record, handle, 'fasta')

    return MASKED_FNA, crisprName_maps_seqName