def get_part_sequence(self, fasta_file, header, start, stop, nterminus, strand, name):
    """
    Pull the sequence flanking a partial gene call out of a FASTA file.

    Uses pyfaidx (https://github.com/mdshw5/pyfaidx, ``pip install pyfaidx``).

    Args:
        fasta_file (str): input fasta file
        header (str): ORF identifier; everything from the last "_" onwards
            is dropped to obtain the contig header
        start (int): start coordinate of the gene on the contig
        stop (int): stop coordinate of the gene on the contig
        nterminus (int): length of missing sequence
        strand (str): "+" or "-"
        name (str): gene name (not used by this function)

    Returns:
        str or None: for strand "-" the sequence of the interval
        [stop, stop + nterminus]; for strand "+" the sequence of the
        interval [start - nterminus, start], with the lower bound clamped
        to 1; None for any other strand value.
    """
    # Prodigal appends "_<n>" to the contig name. rsplit leaves a header
    # without any underscore untouched, whereas slicing with rfind() == -1
    # would silently chop off its last character.
    header = header.rsplit("_", 1)[0]

    genes = Fasta(fasta_file, sequence_always_upper=False, read_long_names=False, one_based_attributes=True)

    logger.info(json.dumps({"strand":strand, "start":start, "stop":stop, "nterminus":nterminus}, indent=2))

    # NOTE(review): both intervals include the gene's own boundary base
    # (stop / start) -- confirm callers expect that overlap.
    if strand == "-":
        return str(genes.get_spliced_seq(header, [[stop, stop + nterminus]]))

    if strand == "+":
        # A gene closer than `nterminus` bases to the contig start would
        # give a zero or negative coordinate; clamp to the first base.
        upstream_start = max(1, start - nterminus)
        return str(genes.get_spliced_seq(header, [[upstream_start, start]]))

    # Unknown strand: keep the historical "no result" behaviour, explicitly.
    return None
def test_split_seq(self):
    """Spliced fetch of the BED12 blocks must reproduce the reference gene sequence."""
    fa = Fasta('data/chr17.hg19.part.fa')
    gene = Fasta("data/gene.bed12.fasta")
    first_key = list(gene.keys())[0]
    expect = gene[first_key][:].seq

    bed = "data/gene.bed12"
    with open(bed) as fi:
        # Only the first record of the BED file is used.
        fields = fi.readline().strip().split("\t")

    chrom = fields[0]
    chrom_start = int(fields[1])
    strand = fields[5]

    # BED12 columns 12 and 11 hold the block offsets and block sizes as
    # comma-separated lists; the final item of each split is discarded
    # (presumably the empty string after a trailing comma).
    block_offsets = [int(item) for item in fields[11].split(",")[:-1]]
    block_sizes = [int(item) for item in fields[10].split(",")[:-1]]

    block_starts = [chrom_start + offset for offset in block_offsets]
    block_ends = [begin + size for begin, size in zip(block_starts, block_sizes)]

    # bed half-open: shift one side of every block depending on the strand
    if strand == "-":
        block_starts = [begin + 1 for begin in block_starts]
    else:
        block_ends = [finish - 1 for finish in block_ends]

    intervals = zip(block_starts, block_ends)
    result = fa.get_spliced_seq(chrom, intervals, rc=True)

    print(result.seq)
    print("====")
    print(expect)

    assert result.seq == expect
def test_split_seq(self):
    """Fetch a gene block-by-block via get_spliced_seq and compare it with the stored gene FASTA."""
    fa = Fasta('data/chr17.hg19.part.fa')
    gene = Fasta("data/gene.bed12.fasta")
    expect = gene[list(gene.keys())[0]][:].seq

    bed = "data/gene.bed12"
    with open(bed) as fi:
        first_line = fi.readline()
    columns = first_line.strip().split("\t")

    chrom, gene_start, strand = columns[0], int(columns[1]), columns[5]

    # parse bed12 format: relative block starts (column 12) and block sizes
    # (column 11); the last element of each comma split is dropped.
    rel_starts = list(map(int, columns[11].split(",")[:-1]))
    sizes = list(map(int, columns[10].split(",")[:-1]))

    abs_starts = [gene_start + rel for rel in rel_starts]
    abs_ends = [left + width for left, width in zip(abs_starts, sizes)]

    # bed half-open: "-" strand moves the starts up by one, any other
    # strand moves the ends down by one.
    if strand == "-":
        abs_starts = [left + 1 for left in abs_starts]
    else:
        abs_ends = [right - 1 for right in abs_ends]

    result = fa.get_spliced_seq(chrom, zip(abs_starts, abs_ends), rc=True)

    print(result.seq)
    print("====")
    print(expect)
    assert result.seq == expect
def get_part_sequence(self, fasta_file, header, start, stop, nterminus, strand, name):
    """
    Pull the sequence flanking a partial gene call out of a FASTA file.

    Uses pyfaidx (https://github.com/mdshw5/pyfaidx, ``pip install pyfaidx``).

    Args:
        fasta_file (str): input fasta file
        header (str): ORF identifier; everything from the last "_" onwards
            is dropped to obtain the contig header
        start (int): start coordinate of the gene on the contig
        stop (int): stop coordinate of the gene on the contig
        nterminus (int): length of missing sequence
        strand (str): "+" or "-"
        name (str): gene name (not used by this function)

    Returns:
        tuple or None: ``(sequence, first, last)`` where ``sequence`` is the
        string fetched for the interval ``[first, last]``:

        * ``nterminus == 0``: the gene itself, ``[start, stop]``
        * strand "-": ``[stop + 1, stop + nterminus]``
        * strand "+": ``[start - nterminus, start - 1]``, each bound
          clamped to a minimum of 1

        None when the FASTA file cannot be opened (the error is logged) or
        when ``strand`` is neither "+" nor "-".
    """
    # Prodigal appends "_<n>" to the contig name. rsplit leaves a header
    # without any underscore untouched, whereas slicing with rfind() == -1
    # would silently chop off its last character.
    header = header.rsplit("_", 1)[0]

    # Opening the index is deliberately best-effort: any failure is logged
    # and reported to the caller as "no result" instead of propagating.
    try:
        genes = Fasta(fasta_file, sequence_always_upper=False, read_long_names=False, one_based_attributes=True)
    except Exception as e:
        logger.error(e)
        return None

    if not genes:
        return None

    if strand not in ("+", "-"):
        return None

    if nterminus == 0:
        # Nothing is missing: hand back the gene's own interval.
        first, last = start, stop
    elif strand == "-":
        first, last = stop + 1, stop + nterminus
    else:
        # Upstream window on the "+" strand; never reach below base 1.
        first = max(1, start - nterminus)
        last = max(1, start - 1)

    return str(genes.get_spliced_seq(header, [[first, last]])), first, last
# cds = cds.sort_values(['chr','startGene']) for index,row in cds.iterrows(): print(index,row['id']) start = time.time() # Convert CDS list into numeric array coordinates = array(row['coordinates'].split(',')).astype(int).tolist() coordinates = [coordinates[i:i+2] for i in range(0, len(coordinates), 2)] # Open ref and outgroup ref = Fasta('/data/shared/dgn/ref/Chr' + row['chr'] +'.fasta',sequence_always_upper=True) outgroup = Fasta(outgroupFastas + '/Chr' + row['chr'] +'_dsim.fasta',sequence_always_upper=True) ## Extract ref and outgroup seq refSeq = ref.get_spliced_seq(list(ref.keys())[0], coordinates).seq outgroupSeq = outgroup.get_spliced_seq(list(outgroup.keys())[0], coordinates).seq # Check length divisible by 3 if((len(refSeq) % 3) == 0): # Open multifasta multiFasta = Fasta('/data/shared/dgn/alignments/'+ args.population + '_Chr' + row['chr'] +'.seq',sequence_always_upper=True) # Extract samples from fastas samples = list(multiFasta.keys()) # Create empty array with ndimesions equal to multi-Fasta lines and length matrix = np.empty([len(samples) + 1, len(refSeq)],dtype='str') positions=[]
# Per-CDS loop: for every row of `cds`, fetch the spliced reference and
# outgroup sequences and open the population alignment. The rest of the loop
# body lies outside this excerpt.
for index, row in cds.iterrows():
    print(index, row['id'])
    # Timestamp taken at the top of the iteration (its use is not visible here)
    start = time.time()
    # Convert CDS list into numeric array: comma-separated string -> list of ints
    coordinates = array(row['coordinates'].split(',')).astype(int).tolist()
    # Group the flat list into consecutive pairs, the interval form passed
    # to get_spliced_seq below
    coordinates = [
        coordinates[i:i + 2] for i in range(0, len(coordinates), 2)
    ]
    # Open ref and outgroup
    ref = Fasta('/data/shared/dgn/ref/Chr' + row['chr'] + '.fasta')
    outgroup = Fasta(outgroupFastas + '/Chr' + row['chr'] + '_dsim.fasta')
    ## Extract ref and outgroup seq, upper-cased
    refSeq = ref.get_spliced_seq(row['chr'], coordinates).seq.upper()
    # NOTE(review): `outputHeader` is not defined anywhere in this excerpt --
    # confirm it is set earlier in the file.
    outgroupSeq = outgroup.get_spliced_seq(outputHeader, coordinates).seq.upper()
    # Skip any CDS whose reference sequence contains the character 'M'
    if ('M' in refSeq):
        continue
    else:
        # Proceed only when the reference length is a multiple of 3
        if ((len(refSeq) / 3).is_integer()):
            # Open population multifasta
            popFasta = Fasta('/data/shared/dgn/alignments/' + args.population + '_Chr' + row['chr'] + '.seq')
            # Extract samples
            samples = list(popFasta.keys())