Example #1
0
def find_genes(record, outdir):
    """Write one FASTA file per gene-name prefix with the matching genes.

    For each prefix in a fixed list, every ``gene`` feature of ``record``
    whose ``gene`` qualifier contains the prefix is extracted and written to
    ``outdir + prefix + '.fasta'``, sorted by record id. A file is written
    for every prefix, even when no gene matched.

    Minus-strand genes are taken as the reverse complement of the feature's
    own slice. (Slicing the reverse complement of the whole sequence with
    forward coordinates, as was done before, selects a different region.)

    :param record: annotated record providing ``seq`` and ``features``.
    :param outdir: output path prefix; it is concatenated as-is, so it must
        already end with a path separator.
    """
    seq = record.seq
    for gene_name in ['pep', 'yej', 'omp', 'rim', 'pdf', 'sbm', 'asp', 'def']:
        genes = []
        for feature in record.features:
            if feature.type != 'gene' or 'gene' not in feature.qualifiers:
                continue
            # The qualifier value is stringified whole, so the substring test
            # and the id are based on that string form.
            gene_name_gb = str(feature.qualifiers['gene'])
            if gene_name not in gene_name_gb:
                continue
            strand = feature.location.strand
            if strand not in (1, -1):
                print('Error')
                continue
            # Remove the characters that should not appear in the FASTA id
            # (works on both Python 2 and Python 3 strings).
            seq_name = ''.join(c for c in gene_name_gb if c not in '!@#$[]')
            gene_seq = seq[feature.location.start:feature.location.end]
            if strand == -1:
                gene_seq = gene_seq.reverse_complement()
            genes.append(SeqIO.SeqRecord(gene_seq, id=seq_name))

        file_out = outdir + gene_name + '.fasta'
        genes.sort(key=lambda x: str(x.id))
        SeqIO.write(genes, file_out, "fasta")
Example #2
0
def read(file):
    """Write the two sequences of the first alignment in a report as FASTA.

    The report is scanned past the line containing
    "Maximal single base matches"; the first following line whose first
    token is "ALIGNMENT" is handed to ``readAln``. Two records are built
    from the returned structure and written to ``args.out``.

    :param file: path of the report to read.
    :raises IndexError: if the header marker is missing, or if no
        "ALIGNMENT" line follows it.
    """
    # Read everything up front and close the file straight away.
    with open(file) as handle:
        lines = handle.readlines()

    idx = 0
    # get past header
    while True:
        line = lines[idx]
        idx += 1
        if "Maximal single base matches" in line:
            break

    aln = ()
    while idx < len(lines):
        fields = lines[idx].split()
        # a line starting with ALIGNMENT opens an alignment section
        if fields and fields[0] == "ALIGNMENT":
            aln = readAln(lines, idx)
            break
        idx += 1

    # aln[0] holds the header tokens (also used as the description),
    # aln[1] holds the sequence data as returned by readAln.
    rec1 = SeqIO.SeqRecord(Seq(aln[1][4]),
                           id=aln[0][4],
                           description=" ".join(aln[0]))
    rec2 = SeqIO.SeqRecord(Seq(aln[1][5]),
                           id=aln[0][9],
                           description=" ".join(aln[0]))
    SeqIO.write([rec1, rec2], args.out, "fasta")
Example #3
0
def parseFasta(name, nucleotides=False):
    """
    Build a Samples collection from the records of a fasta file.

    Each record is re-wrapped in a new SeqRecord with an empty description
    and attached to the sample selected by getID(record): as a genome when
    `nucleotides` is true, otherwise as a Protein.
    :param name: str, path of the fasta file to read
    :param nucleotides: boolean, True for nucleotide input, False for proteins
    :return: Samples, the samples populated from the file
    """
    samples = Samples()
    with open(name) as handle:
        for entry in SeqIO.parse(handle, "fasta"):
            sample_id = getID(entry)
            if not nucleotides:
                protein_name = getProteinName(entry)
                wrapped = SeqIO.SeqRecord(entry.seq,
                                          id=entry.id,
                                          name=protein_name,
                                          description='')
                protein = Protein(protein_name, wrapped, getOrigin(entry))
                samples.getSample(sample_id).addProtein(protein)
                continue
            genome = SeqIO.SeqRecord(entry.seq,
                                     id=entry.id,
                                     name=entry.id,
                                     description='')
            samples.getSample(sample_id).addGenome(genome)
    return samples
Example #4
0
    def simulateAmpliconReads(self,sequenceAbundanceCounter, prefix, readSize=76, coverage=10, reverseComplementAmplicon=False):
        """Write the counted sequences to a temporary FASTA and run the simulator on it.

        Every sequence is emitted once per unit of its count, so the FASTA
        mirrors the abundances in ``sequenceAbundanceCounter``. The external
        program ``self.executable`` is then started in amplicon mode through
        ``os.system``.
        """
        top_entry = sequenceAbundanceCounter.most_common(1)
        print(top_entry)
        highest_count = top_entry[0][1]

        # Walk the thresholds from the highest count down to 1; at each
        # threshold every sequence whose count reaches it is emitted once.
        replicated = []
        for threshold in range(highest_count, 0, -1):
            for seq_text, abundance in sequenceAbundanceCounter.most_common():
                if abundance < threshold:
                    # most_common() is sorted by count, nothing further qualifies
                    break
                replicated.append(seq_text)

        records = []
        for position, seq_text in enumerate(replicated):
            template = Seq(seq_text)
            if reverseComplementAmplicon:
                template = template.reverse_complement()
            records.append(SeqIO.SeqRecord(template, '%s-%s' % (str(position), str(seq_text))))

        fastaPath = getTempFileName('art')+'.fa'
        SeqIO.write(records, fastaPath, "fasta")

        command = '%s -amp -i %s -l %s -f %s -o %s -sam -ss HS25' %(self.executable, fastaPath, readSize, coverage, prefix)
        os.system(command)
Example #5
0
def combine_result(INS, DEL):
    """Flatten insertion and deletion entries into a list of SeqRecords.

    Both arguments are iterables of iterables of entries. Insertion entries
    must have 8 fields and deletion entries 7; entries of any other length
    are skipped. Field 5 becomes the record sequence, and a '*'-joined key
    built from the remaining fields is used as id, name and description.

    :return: list of records, insertions first, then deletions.
    """
    records = []

    # Insertions: key fields are 0-4, 6 and 7.
    for group in INS:
        for entry in group:
            if len(entry) == 8:
                label = "%s*%s*%d*%d*%s*%d*%d" % (
                    entry[0], entry[1], entry[2], entry[3], entry[4],
                    entry[6], entry[7])
                placeholder = SeqIO.SeqRecord(seq=str(), id=label, name=label, description=label)
                placeholder.seq = Seq(entry[5])
                records.append(placeholder)
    # Drop the local reference and ask the collector to run.
    del INS
    gc.collect()

    # Deletions: key fields are 0-4 and 6.
    for group in DEL:
        for entry in group:
            if len(entry) == 7:
                label = "%s*%s*%d*%d*%d*%d" % (
                    entry[0], entry[1], entry[2], entry[3], entry[4],
                    entry[6])
                placeholder = SeqIO.SeqRecord(seq=str(), id=label, name=label, description=label)
                placeholder.seq = Seq(entry[5])
                records.append(placeholder)
    del DEL
    gc.collect()

    return records
Example #6
0
 def __init__(self, score, name1, start1, match_size1, strand1,
              size1, seq1, name2, start2, match_size2, strand2, size2, seq2):
     """Store one scored match between two named sequences.

     Start, match size and size are converted to ``int``; each sequence
     text has its trailing newlines removed and is wrapped in a SeqRecord.
     The score, names and strands are stored unchanged.
     """
     self.score = score

     # First sequence of the pair.
     self.name1, self.strand1 = name1, strand1
     self.start1 = int(start1)  # origin-zero
     self.match_size1 = int(match_size1)
     self.size1 = int(size1)
     text1 = seq1.rstrip('\n')
     self.seq1 = SeqIO.SeqRecord(Seq(text1))

     # Second sequence of the pair.
     self.name2, self.strand2 = name2, strand2
     self.start2 = int(start2)  # origin-zero, orientation dependent
     self.match_size2 = int(match_size2)
     self.size2 = int(size2)
     text2 = seq2.rstrip('\n')
     self.seq2 = SeqIO.SeqRecord(Seq(text2))
Example #7
0
def extract_cds_and_protein_fasta_from_rna(fpath):
    """Write the CDS and protein sequences found in an RNA file as FASTA.

    Two files are created next to ``fpath`` (its extension replaced by
    ``_extracted_cds.fa`` and ``_extracted_protein.fa``). Each entry yielded
    by ``iterate_ncbi_rna_cds_and_tranlation`` contributes one upper-cased
    CDS record (id = RNA id) and one upper-cased protein record
    (id = protein id); the other identifiers go into the description.
    """
    stem = os.path.splitext(fpath)[0]
    cds_fpath = stem + '_extracted_cds.fa'
    prot_fpath = stem + '_extracted_protein.fa'
    with open(cds_fpath, 'w') as cds_out, open(prot_fpath, 'w') as prot_out:
        for entry in iterate_ncbi_rna_cds_and_tranlation(fpath):
            # the leading record object of each entry is not needed here
            _rec, gene_id, rna_id, prot_id, cds, prot = entry
            cds_rec = SeqIO.SeqRecord(
                Seq.Seq(cds.upper()),
                id=rna_id,
                name='',
                description='gene_id=%s prot_id=%s' % (gene_id, prot_id))
            prot_rec = SeqIO.SeqRecord(
                Seq.Seq(prot.upper()),
                id=prot_id,
                name='',
                description='gene_id=%s rna_id=%s' % (gene_id, rna_id))
            SeqIO.write(cds_rec, cds_out, 'fasta')
            SeqIO.write(prot_rec, prot_out, 'fasta')
Example #8
0
def get_window(genome, position, radius):
    """Return the window of ``radius`` bases on each side of a CG dinucleotide.

    :param genome: mapping from sequence id to a sliceable sequence.
    :param position: ``(seqid, pos)`` pair; ``genome[seqid][pos:pos + 2]``
        must read 'CG' (case-insensitive).
    :param radius: number of bases to include on each side of the 'CG'.
    :return: SeqRecord whose id is ``'{seqid}:{start}-{end}'`` for the
        slice bounds actually used as ``start`` and the nominal ``end``.
    :raises AssertionError: if the position does not hold 'CG'.
    """
    seqid, pos = position
    assert genome[seqid][pos:pos + 2].upper() == 'CG'
    # A negative start would be interpreted by slicing as an offset from the
    # END of the sequence, so sites closer than ``radius`` to the beginning
    # would yield a wrong (usually empty) window. Clamp to the sequence start.
    start = max(pos - radius, 0)
    end = pos + radius + 2
    seq = genome[seqid][start:end]
    return SeqIO.SeqRecord(seq=seq, id=f'{seqid}:{start}-{end}')
Example #9
0
def main():
    """Convert a tab-separated repertoire table into a FASTA file.

    The first line of ``params.repertoire_path`` is a header that must
    contain the columns "Clonal sequence(s)" and "Clone count". Every
    following line becomes one record with id
    ``cluster___<index>___size___<count>``. Reading stops at the first
    blank line. The records are written to ``params.output_path``.
    """
    log = CreateLogger()
    params = ParseCommandLineParams(log)
    from Bio import Seq
    from Bio import SeqIO
    log.info("Reading reads from %s" % params.repertoire_path)
    records = []
    with open(params.repertoire_path) as input_file:
        # Strip the line terminator first, otherwise the last column name
        # (and the last field of every row) carries a trailing newline.
        header = input_file.readline().rstrip("\r\n").split("\t")
        sequence_column = header.index("Clonal sequence(s)")
        size_column = header.index("Clone count")
        for cluster_index, line in enumerate(input_file):
            line = line.rstrip("\r\n")
            # Lines read from a file keep their newline, so a blank line is
            # only detectable after stripping it.
            if not line:
                break
            info = line.split("\t")
            record = SeqIO.SeqRecord(seq=Seq.Seq(info[sequence_column]),
                                     id="cluster___%d___size___%d" %
                                     (cluster_index, int(info[size_column])),
                                     description="")
            records.append(record)

    log.info("Read %d reads" % len(records))

    log.info("Writing output")
    with smart_open(params.output_path, "w") as output_file:
        for record in records:
            SeqIO.write(record, output_file, "fasta")
Example #10
0
def AnalyzeRegion(options, RegionSequenceSource):
    """Evaluate translation windows for the region named in ``options``.

    Exits with status 1 when ``options.RegionName`` is empty. Otherwise the
    region's source sequence is fetched and translated into a template
    protein, and the evaluation is repeated (at most 100 times) until all
    recommended windows agree. Output alignments are built when
    ``options.WriteFiles`` is set.

    :return: always the pair ``(0, 0)``.
    """
    region_name = options.RegionName
    if not region_name:
        print("Region name undefined.")
        exit(1)

    sequences = LoadSequences(options, region_name)
    source_seq = RegionSequenceSource.fetchGeneSequence(region_name)
    if source_seq is None:
        return 0, 0

    TemplateProtein = SeqIO.SeqRecord(source_seq.translate(),
                                      id=region_name,
                                      description="")

    region_sequences = []
    for _attempt in range(100):
        evaluation = EvaluateAllSequencesAllTranslationWindows(
            options, TemplateProtein, sequences)
        recommended_windows, region_sequences, _total = evaluation

        # stop as soon as every sequence recommends the same window
        distinct_windows = set(recommended_windows)
        if len(distinct_windows) == 1:
            print("Found correct window.")
            break

    if options.WriteFiles:
        BuildOutputAlignments(options, region_name, region_sequences,
                              TemplateProtein)

    print("Rate for %s: %.2f%%" % (region_name, 0))
    return 0, 0
Example #11
0
def writeFastaFile(sequences, fileName):
    '''
    write a set of sequences to a fasta file.

    Each item of `sequences` is an iterable of pieces; the string forms of
    the pieces are concatenated to give the sequence text. Records are
    named seq_0, seq_1, ... in input order.

    returns the name of the new file
    '''
    utils.logMessage(
        "PrimerManager::writeFastaFile( )",
        "Writing {0} sequences to fasta file".format(len(sequences)))
    seqRecords = []
    for index, sequence in enumerate(sequences):
        # join() also copes with an empty item and needs no `reduce`,
        # which is not a builtin on Python 3.
        seqStr = "".join(str(piece) for piece in sequence)
        seqRecord = SeqIO.SeqRecord(Seq.Seq(seqStr,
                                            Alphabet.IUPAC.extended_dna),
                                    id="seq_{0}".format(index))
        seqRecords.append(seqRecord)

    # Close the handle deterministically so the data is flushed before the
    # file name is handed back to the caller.
    with open(fileName, "w") as fastaHandle:
        SeqIO.write(seqRecords, fastaHandle, "fasta")

    utils.logMessage("PrimerManager::writeFastaFile( )",
                     "writing fasta file complete")
    return fileName
Example #12
0
def read_genes(marburg_genome=None):
    """Collect per-gene sub-sequences of all genomes and start the global alignment.

    For every gene in ``marburg_genes`` a list of SeqRecords (one per
    genome) is stored in ``all_genes`` under the gene's name. The slice
    bounds come from columns 1 and 2 of ``./Output/found_genes/<genome>.csv``.
    Afterwards ``edit_distance_matrices`` is reset to zero-filled matrices
    and ``global_align`` is called.

    :param marburg_genome: optional extra genome; when given it is first
        passed to ``align_and_find_genes`` and then appended to the
        ebolavirus genomes.
    """
    global all_genes, edit_distance_matrices

    genomes = ebolavirus_genomes
    if marburg_genome:
        align_and_find_genes(marburg_genome)  # align all genes in the marburg genome
        genomes = ebolavirus_genomes + [marburg_genome]

    for gene in marburg_genes:
        records = []
        # NOTE(review): the csv row is selected by the genome's position in
        # `genomes`, not by the gene -- confirm this matches the csv layout.
        for row, genome in enumerate(genomes):
            table = pd.read_csv("./Output/found_genes/" + genome.name + ".csv", header=None)
            first = int(table.loc[row, 1])  # begin index
            last = int(table.loc[row, 2])  # end index
            entry = SeqIO.SeqRecord(genome.seq[first: last])
            entry.name = genome.name
            records.append(entry)
        all_genes[gene.name] = records

    # 7 square matrices of zeros, one row/column per genome (6 with marburg, else 5)
    size = 6 if marburg_genome else 5
    edit_distance_matrices = [[[0] * size for _ in range(size)] for _ in range(7)]

    global_align(with_marburg=2 if marburg_genome else 1)
Example #13
0
    def run_hhblits_cns(self):
        """Build a multiple sequence alignment for the full sequence with hhblits.

        The upper-cased full protein sequence is written as the query FASTA,
        hhblits is run against the configured database, and the resulting
        a3m file is converted with reformat.pl first to a2m and then to a
        versioned FASTA file.
        """
        target = self.target
        version = self._MSA_FULL_VERSION
        full_seq = Seq(self.protein.str_seq_full.upper())

        query = os.path.join(PATHS.msa, 'query', target + '_full.fasta')
        SeqIO.write(SeqIO.SeqRecord(full_seq, name=target, id=target),
                    query, "fasta")

        a3m_path = os.path.join(PATHS.hhblits, 'a3m', target + '_full.a3m')
        a2m_path = os.path.join(PATHS.hhblits, 'a2m', target + '_full.a2m')
        fasta_path = os.path.join(PATHS.hhblits, 'fasta',
                                  target + '_full_v%s.fasta' % version)

        db_hh = '/cs/zbio/orzuk/projects/ContactMaps/data/MSA_Completion/hh/uniprot20_2016_02/uniprot20_2016_02'

        hhblits_args = ['hhblits', '-i', query, '-d', db_hh]
        hhblits_args += ['-n', '3', '-e', '1e-3']
        hhblits_args += ['-maxfilt', '10000000000', '-neffmax', '20']
        hhblits_args += ['-nodiff', '-realign']
        hhblits_args += ['-realign_max', '10000000000', '-oa3m', a3m_path]
        subprocess.run(hhblits_args)

        # a3m -> a2m -> fasta
        for source, destination in ((a3m_path, a2m_path),
                                    (a2m_path, fasta_path)):
            subprocess.run(['reformat.pl', source, destination])
def generate_decoy_sequences(orig_seqs,
                             decoy_prefix='r',
                             decoy_type='reverse'):
    """
    Generate decoy BioSeq entries

    Each input entry yields one decoy whose sequence is either reversed or
    randomly shuffled and whose id is the original id with ``decoy_prefix``
    prepended. Name and description are copied unchanged.

    :param orig_seqs: iterable of entries exposing ``seq``, ``id``,
        ``name`` and ``description``.
    :param decoy_prefix: string prepended to every decoy id.
    :param decoy_type: ``'reverse'`` or ``'shuffle'``.
    :return: list of decoy records, in input order.
    :raises ValueError: for any other ``decoy_type``.
    """

    if decoy_type == 'reverse':

        def decoy_func(seq):
            return seq[::-1]
    elif decoy_type == 'shuffle':

        def decoy_func(seq):
            seq_list = list(seq)
            random.shuffle(seq_list)
            shuffled = ''.join(seq_list)
            # Hand back the same type that came in, as the 'reverse' branch
            # does, so a sequence object is not silently downgraded to str.
            return shuffled if isinstance(seq, str) else type(seq)(shuffled)
    else:
        raise ValueError('Unknown decoy type: {}'.format(decoy_type))

    return [
        SeqIO.SeqRecord(seq=decoy_func(orig_fasta.seq),
                        id='{}{}'.format(decoy_prefix, orig_fasta.id),
                        name=orig_fasta.name,
                        description=orig_fasta.description)
        for orig_fasta in orig_seqs
    ]
Example #15
0
 def _write_fasta(self):
     """Write the target's upper-cased sequence to ``self.fasta_fname``.

     The sequence comes from ``self.protein.str_seq`` when no family is
     set, otherwise from ``self.str_seq``. Nothing is written when that
     sequence is None.
     """
     if self._family is None:
         raw = self.protein.str_seq
     else:
         raw = self.str_seq
     if raw is None:
         return
     record = SeqIO.SeqRecord(Seq(raw.upper()),
                              name=self.target,
                              id=self.target)
     SeqIO.write(record, self.fasta_fname, "fasta")
Example #16
0
    def _run_hhblits(self):
        """Generate a multiple sequence alignment for the target with hhblits.

        Does nothing when the protein has no sequence. Otherwise the
        upper-cased sequence is written as the query FASTA inside the
        target's hhblits directory, hhblits is run against the "scop40"
        database, and the a3m output is converted with reformat.pl to a2m
        and then to a versioned FASTA file.

        The external programs are started with argument lists rather than
        shell command strings, so paths containing spaces or shell
        metacharacters are passed through verbatim. A missing executable
        therefore surfaces as ``FileNotFoundError``.
        """
        if self.protein.str_seq is None:  # or self._train:
            return
        seq = Seq(self.protein.str_seq.upper())

        target = self.target
        version = self._MSA_VERSION

        sequence = SeqIO.SeqRecord(seq, name=target, id=target)
        target_hhblits_path = get_target_hhblits_path(target)
        check_path(target_hhblits_path)

        query = os.path.join(target_hhblits_path, target + '.fasta')
        SeqIO.write(sequence, query, "fasta")
        output_hhblits = os.path.join(target_hhblits_path, target + '.a3m')
        output_reformat1 = os.path.join(target_hhblits_path, target + '.a2m')
        output_reformat2 = os.path.join(target_hhblits_path,
                                        target + '_v%s.fasta' % version)

        db_hh = os.path.join(PATHS.hhblits, "scop40")

        hhblits_cmd = [
            'hhblits', '-i', query, '-d', db_hh, '-n', '3', '-e', '1e-3',
            '-maxfilt', '10000000000', '-neffmax', '20', '-nodiff',
            '-realign_max', '10000000000', '-oa3m', output_hhblits
        ]
        subprocess.run(hhblits_cmd)

        reformat_script = os.path.join(PATHS.periscope, 'scripts',
                                       'reformat.pl')
        # a3m -> a2m -> fasta
        subprocess.run(
            ['perl', reformat_script, output_hhblits, output_reformat1])
        subprocess.run(
            ['perl', reformat_script, output_reformat1, output_reformat2])
def Naive_assembler(file, threshold = 2):
    """Greedily merge reads by pairwise local alignment until no pair scores high enough.

    In each round all read pairs are aligned with ``local_align``; pairs are
    then merged with ``merge_seqs`` in order of decreasing score, each read
    taking part in at most one merge per round. Rounds stop when a single
    read is left or the best score falls below ``threshold``.

    :param file: path of the read file; its extension is used as the
        SeqIO format name.
    :param threshold: minimum alignment score for two reads to be merged.
    :return: list of the remaining records (empty for an empty input).
    """
    records = list(SeqIO.parse(file, format=file.split(".")[-1]))
    unable_to_merge = False
    # `> 1` rather than `!= 1`: with no reads at all there are no alignments,
    # and looking at the best alignment would fail on an empty list.
    while len(records) > 1 and not unable_to_merge:
        print(len(records))
        # Renumber the reads so the ids are unique within this round.
        for number, record in enumerate(records, start=1):
            record.id = str(number)

        alignments = []
        for i, record1 in enumerate(records):
            for record2 in records[i + 1:]:
                score, alignment = local_align(str(record1.seq), str(record2.seq))
                alignments.append([score, alignment, record1, record2])
        alignments.sort(key=lambda element: element[0], reverse=True)

        # If even the best pair is too weak this is the last round.
        unable_to_merge = alignments[0][0] < threshold

        records_merged = set()
        new_reads = []
        for rank, (score, alignment, record1, record2) in enumerate(alignments):
            if score < threshold:
                break
            if record1.id in records_merged or record2.id in records_merged:
                continue
            new_reads.append(SeqIO.SeqRecord(id=str(rank), seq=merge_seqs(alignment)))
            records_merged.update((record1.id, record2.id))
        # Reads that took part in no merge are carried over unchanged.
        new_reads += [record for record in records if record.id not in records_merged]
        records = new_reads
    return records
Example #18
0
 def parse_result(self, genome_path):
     """Yield one SeqRecord per gene listed in ``<genome_path>.fasta.lst``.

     Nothing is yielded when the result file does not exist. Each
     'FASTA definition line' selects the contig that the following gene
     table refers to; every gene row gives a 1-based start (column 2) and
     an end (column 3) inside that contig, with '<' / '>' markers ignored.
     After the last gene has been consumed the result file, 'gms.log' and
     'GeneMark_hmm.mod' are deleted.

     A leading '~' in ``genome_path`` is expanded for every file access,
     not only for the existence check.
     """
     result_path = os.path.expanduser(genome_path + '.fasta.lst')
     if not os.path.isfile(result_path):
         return
     # Load the contigs and close the genome file straight away.
     with open(os.path.expanduser(genome_path)) as genome_handle:
         contigs = {s.id: s.seq for s in SeqIO.parse(genome_handle, 'fasta')}
     with open(result_path, 'r') as f:
         reading_genes = False
         for line in f:
             if line.startswith('    #'):
                 # header row of a gene table
                 reading_genes = True
                 continue
             if line.startswith('---') or line.strip() == '':
                 reading_genes = False
             if reading_genes:
                 gene_sp = re.split(r'[\t ]+', line.strip())
                 seq_id = contig_id + '_gene_' + gene_sp[0]
                 l_index = int(gene_sp[2].replace('<', '')) - 1
                 r_index = int(gene_sp[3].replace('>', ''))
                 seq = contig_seq[l_index:r_index]
                 yield SeqIO.SeqRecord(seq,
                                       id=seq_id,
                                       description='',
                                       name='')
             if line.startswith('FASTA definition line'):
                 contig_id = line.strip().replace('FASTA definition line: ',
                                                  '')
                 contig_seq = contigs[contig_id]
     os.remove(result_path)
     os.remove('gms.log')
     os.remove('GeneMark_hmm.mod')
Example #19
0
def add_isolates(dic_list, db_name):
    """Convert gzipped GenBank assemblies to FASTA and add them to a kraken library.

    For every assembly the records of the gzipped GenBank file
    ``assembly['dest']`` are rewritten with id ``gi|<gi annotation>`` into a
    FASTA file in the current working directory, which is then passed to
    ``kraken_add``.

    :param dic_list: list of dicts with at least the keys 'organism' and
        'dest'.
    :param db_name: database name handed to ``kraken_add``.
    :raises KeyError: if a record has no 'gi' annotation.
    """
    suffix = '.gbff.gz'
    for assembly in dic_list:
        print("Adding {} to kraken stagging area.".format(
            assembly['organism']),
              file=sys.stderr)
        genbank_zip_file = assembly['dest']
        new_seqs = []
        with gzip.open(genbank_zip_file, 'rt') as fi:
            for s in SeqIO.parse(fi, 'genbank'):
                tmp = SeqIO.SeqRecord(s.seq)
                tmp.id = 'gi|{}'.format(s.annotations['gi'])
                tmp.description = s.description
                tmp.name = s.name
                new_seqs.append(tmp)
        # str.strip() removes a SET of characters from both ends, not a
        # suffix, and could eat into the real file name. Cut the suffix off
        # explicitly instead.
        base_name = os.path.basename(genbank_zip_file)
        if base_name.endswith(suffix):
            base_name = base_name[:-len(suffix)]
        fa_file = os.path.join(os.getcwd(), base_name + ".fa")
        with open(fa_file, 'wt') as tmpf:
            SeqIO.write(new_seqs, tmpf, 'fasta')
        kraken_add(db_name, fa_file)
    print(
        "Added all {} assemblies to kraken stagging area. DB is ready to build"
        .format(len(dic_list)),
        file=sys.stderr)
def run(parameters):
    """Split every sequence of a FASTA file into consecutive chunks.

    Each record of ``parameters['input']`` (plain or gzip-compressed) is cut
    into pieces of ``len(sequence) // parameters['size']`` letters (at least
    one letter each); a remainder produces one additional, shorter piece.
    The pieces are written to ``parameters['output']`` with ids
    ``<record id>_<zero-padded index>``.

    :param parameters: dict with the keys 'input', 'output' and 'size'.
    :return: ``(True, "All correct.")``.
    :raises ZeroDivisionError: if ``parameters['size']`` is 0.
    """
    message = "All correct."
    APPLOGGER.info("Reading file...")
    filekind = filetype.guess(parameters['input'])
    if filekind and filekind.extension.lower() == 'gz':
        APPLOGGER.info("Running gzip...")
        handle = gzip.open(parameters['input'], "rt")
    else:
        handle = open(parameters['input'], "rt")
    try:
        APPLOGGER.info("Creating output file...")
        # Text mode: the FASTA writer emits str, not bytes.
        with open(parameters['output'], "w") as out:
            APPLOGGER.info("Parsing FASTA file...")
            for record in SeqIO.parse(handle, "fasta"):
                chunks = len(record.seq)
                # Integer division keeps the chunk size usable as a slice
                # bound and range step; never let it drop to 0.
                chunk_size = max(1, chunks // parameters['size'])
                print(chunks, chunk_size)
                subseqs = [
                    record.seq[i:i + chunk_size]
                    for i in range(0, chunks, chunk_size)
                ]
                num_digits = len(str(len(subseqs)))
                for index, subseq in enumerate(subseqs):
                    seq = SeqIO.SeqRecord(
                        seq=Seq.Seq(subseq),
                        id="{0}_{1:0{2}d}".format(record.id, index, num_digits),
                        description="")
                    SeqIO.write(seq, out, "fasta")
    finally:
        # Release the input even when parsing or writing fails.
        handle.close()
    APPLOGGER.info("Closing files...")
    return True, message
def process_pbp(isolate, protein_fasta, tpd_start, tpd_end, tpd_lab,
                results_csv, k):
    """Cut the [tpd_start:tpd_end] region out of a protein and write it as FASTA.

    The first record of ``protein_fasta`` supplies the protein; the region
    is written to ``<isolate>_<tpd_lab>.prot`` with id
    ``<isolate>_<tpd_lab>_TPD``. Afterwards the input files are removed via
    ``rm`` (except for one hard-coded file name, for which coordinates are
    printed instead) and a progress percentage is printed.

    NOTE(review): ``bassio_nameo``, ``protein_csv``, ``sstart``, ``send`` and
    ``gff_lines`` are not defined in this function; they are presumably
    module-level names -- confirm.
    """
    records = list(SeqIO.parse(protein_fasta, format="fasta"))
    full_protein = records[0].seq

    label = isolate + "_" + tpd_lab
    domain_seq = Seq(str(full_protein[tpd_start:tpd_end]), generic_protein)
    domain_record = SeqIO.SeqRecord(domain_seq, id=label + "_TPD")

    with open(label + ".prot", "w+") as output_handle:
        SeqIO.write(domain_record, output_handle, "fasta")

    # The value is not used further down; the lookup is kept as it was.
    top_id = str(results_csv.iloc[0, 1])[2:]

    if bassio_nameo == "22841_3#15.contigs_velvet.fa.gff":
        print(sstart, send)
        print(tpd_start, tpd_end)
    else:
        os.system("rm " + protein_fasta + " " + protein_csv)

    progress = round((k / len(gff_lines) * 100))
    print("Generating CSV: %s%%" % progress)
Example #22
0
def main() -> None:
    """Generate sample sequences from command-line settings and print them as FASTA.

    Each output sequence is the concatenation of one item from
    ``generate_sample`` and one from ``random_sample``. The NumPy random
    generator is seeded with ``--seed`` first. Records are numbered from 0
    and written to standard output.
    """
    parser = argparse.ArgumentParser()
    for flag in ("--p_i_range", "--p_ji_range"):
        parser.add_argument(flag, nargs=2, type=float, required=True)
    for flag in ("--n_seq", "--n_pairs", "--n_randoms"):
        parser.add_argument(flag, type=int, required=True)
    parser.add_argument("--seed", type=int, default=42)
    args = parser.parse_args()

    np.random.seed(args.seed)

    dependent = generate_sample(args.p_i_range, args.p_ji_range, args.n_seq,
                                args.n_pairs)
    independent = random_sample(args.n_seq, args.n_randoms)

    records = []
    for index, (left, right) in enumerate(zip(dependent, independent)):
        joined = left + right
        records.append(
            SeqIO.SeqRecord(seq=Seq.Seq(joined), id=str(index),
                            description=""))

    SeqIO.write(records, sys.stdout, "fasta")
def return_seq_record_with_features_from_genbank(r, part_id):
    """Build a SeqRecord, with features, from the JSON body of ``r``.

    ``r.json()`` must provide the keys 'sequence' and 'features'. The
    sequence becomes an ambiguous-DNA Seq with ``part_id`` as id and name.
    For every feature entry, ``add_feature`` is called with values taken
    from the entry (only its first location is used) and the result is
    appended to the record's features.

    :return: the populated SeqRecord.
    """
    payload = r.json()
    alphabet = IUPACAmbiguousDNA()
    record = SeqIO.SeqRecord(Seq(payload['sequence'], alphabet),
                             id=part_id,
                             name=part_id)

    for entry in payload['features']:
        first_location = entry['locations'][0]
        built = add_feature(sequence_rec=record,
                            start_postion=first_location['genbankStart'],
                            end_position=first_location['end'],
                            strand=entry['strand'],
                            name=entry['name'],
                            feature_type=entry['type'],
                            feature_id=entry['id'])
        record.features.append(built)

    return record
Example #24
0
def find_upseq(loc, genome, n_up=1000, n_down=200):
    """Return the region around a location, extended on both sides.

    :param loc: indexable with the record id at [0], the contig key at [1],
        the strand at [2] and two coordinates at [3] and [4]
        (presumably 1-based start and end, given the ``- 1`` -- confirm).
    :param genome: mapping from contig key to an object with ``.seq``.
    :param n_up: extension before [3] on the '+' strand, after [4] otherwise.
    :param n_down: extension after [4] on the '+' strand, before [3] otherwise.
    :return: upper-cased SeqRecord with id ``loc[0]``; it is reverse
        complemented for any strand other than '+'.
    """
    contig = genome[loc[1]]
    print(loc)
    forward = loc[2] == "+"
    if forward:
        start, end = loc[3] - 1 - n_up, loc[4] + n_down
    else:
        start, end = loc[3] - 1 - n_down, loc[4] + n_up
    # A negative start would be read by slicing as an offset from the END of
    # the contig and return the wrong (usually empty) region; clamp it to
    # the contig start instead.
    upseq = contig.seq[max(start, 0):end]
    if not forward:
        upseq = upseq.reverse_complement()
    return SeqIO.SeqRecord(upseq.upper(), loc[0], description=str(n_up)+"_bases_upstream")
Example #25
0
 def output(record):
     """Pass the reverse complement of ``record`` on to ``_output``.

     Id, name and description are copied from the original record. The
     per-letter annotations are taken from the reverse-complemented record,
     so they stay aligned with the letters of the new sequence. (Reusing
     the original annotations would pair each value with the wrong letter.)
     """
     flipped = record.reverse_complement()
     record = SeqIO.SeqRecord(
         flipped.seq,
         letter_annotations=flipped.letter_annotations,
         id=record.id,
         name=record.name,
         description=record.description)
     return _output(record)
Example #26
0
 def setUp(self):
     """Create the default FASTA record used by the tests."""
     record_fields = dict(
         seq="ACTGAAC",
         id="testid",
         name="testname",
         description="sequence_name | organism=testorganism | SO=chromosome",
     )
     self.default_fasta_record = SeqIO.SeqRecord(**record_fields)
Example #27
0
def test_make_records_to_dictionary():
    """make_records_to_dictionary must return the given records keyed by "test1" and "test2"."""
    first = SeqIO.SeqRecord(
        Seq("ATGCTCGTAGCTGATCGA"),
        id="test1",
        name="test1",
        description="test record #1",
    )
    second = SeqIO.SeqRecord(
        Seq("GTGCTCGTAGCTGATCGA"),
        id="test2",
        name="test2",
        description="test record #2",
    )
    records = [first, second]
    expected = {"test1": first, "test2": second}
    assert expected == process.make_records_to_dictionary(records)
Example #28
0
def _dump_fasta(inferred_reference, description, file_path):
    """Write the inferred reference as a single FASTA record.

    ``inferred_reference`` is an iterable of strings that are joined to
    form the sequence. The record has an empty id and the given
    description, and is written to ``file_path``.
    """
    sequence_text = ''.join(inferred_reference)
    dna = Seq.Seq(sequence_text, Seq.IUPAC.unambiguous_dna)
    records = [SeqIO.SeqRecord(dna, id='', description=description)]

    log.debug("Writing fasta reference:\n%s\n%s", file_path, description)
    SeqIO.write(records, file_path, "fasta")
Example #29
0
def read_gbk(genome: str) -> SeqIO.SeqRecord:
    """Parse a GenBank file and return all of its records added together as one SeqRecord."""
    # Start from an empty record and append every contig in file order.
    merged = SeqIO.SeqRecord(seq="", id="", name="", features=None)
    contigs = SeqIO.parse(handle=genome, format='genbank')
    for contig in contigs:
        merged += contig
    return merged
Example #30
0
def get_genome_subrecord(chromosome, loc1, loc2):
    """Return the upper-cased [loc1:loc2] slice of a chromosome as a SeqRecord.

    The chromosome is read from ``../data/hg19/chr<chromosome>.fa`` and kept
    in the module-level ``recent_record``; the file is read again only when
    a different chromosome is requested. The record id is
    ``chr<chromosome>:<loc1>:<loc2>``.
    """
    global recent_record
    chrom_id = 'chr{}'.format(chromosome)

    cached = recent_record is not None and recent_record.id == chrom_id
    if not cached:
        fasta_path = "../data/hg19/chr{}.fa".format(chromosome)
        recent_record = SeqIO.read(fasta_path, "fasta")

    id_str = ":".join([chrom_id, str(loc1), str(loc2)])
    window = recent_record[loc1:loc2]
    return SeqIO.SeqRecord(Seq.Seq(str(window.seq).upper()), id=id_str)