def main():
    """Read 'COMP.txt', score every ordered pair of entries, then align.

    Steps, in order:
      1. parse the FASTA file 'COMP.txt' with ``parse_fasta``;
      2. build a table of negated ``align_score`` values for every ordered
         pair of entries (self-pairs included);
      3. pick a center with ``findCenterSeq`` and run ``centerStar_align``.

    Progress messages are printed to stdout. Nothing is returned.
    """
    print('Reading fasta file...')
    sequences = parse_fasta('COMP.txt')

    print('Computing pairwise edit distances...', end='', flush=True)
    # Keyed by (a, b) for every ordered pair produced by iterating
    # `sequences` twice; the value is the negated alignment score.
    distances = {}
    for first in sequences:
        for second in sequences:
            score = align_score(sequences[first], sequences[second])
            distances[(first, second)] = -score
    # Terminate the progress line started above.
    print()

    print('Performing center star alignment...', end='', flush=True)
    center = findCenterSeq(sequences)
    # NOTE(review): neither `distances` nor `profile` is used in the visible
    # code - confirm whether later statements consume them.
    profile = centerStar_align(center, sequences)
def main():
    """Run a center-star alignment of 'COMP.txt' and report the result.

    Steps, in order:
      1. parse the FASTA file 'COMP.txt' with ``parse_fasta``;
      2. pick a center with ``findCenterSeq`` and align with
         ``centerStar_align``;
      3. write one ``key: value`` line per alignment entry to
         'center_start_alignment.txt' (lines joined by newlines, with no
         trailing newline);
      4. print every alignment value to stdout, one per line.

    Nothing is returned.
    """
    print('Reading fasta file...')
    sequences = parse_fasta('COMP.txt')

    print('Performing center star alignment...', end='', flush=True)
    center = findCenterSeq(sequences)
    alignment = centerStar_align(center, sequences)
    # The progress message above is printed without a newline; end that line
    # here so the first alignment row is not appended to the message.
    print()

    with open('center_start_alignment.txt', 'w') as f:
        f.write('\n'.join('{}: {}'.format(key, value)
                          for key, value in alignment.items()))

    for row in alignment.values():
        print(row)
def load_aa_positions(fasta, aa):
    '''
    For every protein sequence in `fasta` builds an index of aminoacids specified in `aa`

    `fasta` is passed unchanged to `parse.parse_fasta`, which must yield
    `(name, sequence)` pairs. A residue `c` is indexed when `c in aa` is true.

    Returns long DataFrame with one row per matching residue and columns:
      prot - name of the sequence the residue belongs to
      aa   - the matching residue character
      pos  - 1-based position of the residue within its sequence

    The per-protein frames are concatenated without resetting the index.
    When there is nothing to index (no sequences, or no matching residue in
    any of them) an empty DataFrame with the three columns above is returned.
    '''
    def build_aa_df(name, seq, aa):
        # One record per residue of `seq` that is a member of `aa`.
        res = [{'prot': name, 'aa': c, 'pos': pos}
               for pos, c in enumerate(seq, start=1)
               if c in aa]
        return pd.DataFrame.from_records(res)

    aa_map = [build_aa_df(name, seq, aa)
              for name, seq in parse.parse_fasta(fasta)]

    # pd.concat raises ValueError on an empty list, and concatenating only
    # empty frames yields a frame without columns; give callers a stable
    # schema in both cases (all() is True for an empty list).
    if all(frame.empty for frame in aa_map):
        return pd.DataFrame(columns=['prot', 'aa', 'pos'])
    return pd.concat(aa_map)
from parse import parse_fastq from teitlib import * from time import sleep from time import time import os import re from fnmatch import fnmatch start_time = time() """ seqs1 = parse_fastq('SRR11397721_1.fastq',438663) seqs2 = parse_fastq('SRR11397721_2.fastq',438663) seqs = seqs1 + seqs2 """ name, ref = parse_fasta("reference.fasta") #aname, test0 = parse_fasta("aligned_reference.fasta") aname,test0 = parse_fasta('align_seqs_1og2.fasta') #aname, test0 = parse_fasta("align_seqs1.fasta") with open('nomatches_seqs1og2.txt') as f: seqs = f.read().splitlines() pname,morning = parse_fasta('morning.fasta') p = "ATTCTGCTGTCAAATTACAGAATAATGAGCTTAG" """ matching = [i for i in seqs if ref.find(i) > -1] c = 0 for i in range(len(matching)): if matching[i].find(p) >= 0: c+=1 print("c:",c)
import os
import re
from fnmatch import fnmatch

start_time = time()
"""
seqs1 = parse_fastq('SRR11397721_1.fastq',438663)
seqs2 = parse_fastq('SRR11397721_2.fastq',438663)
seqs = seqs1 + seqs2
"""
# One entry per line of the file.
with open('nomatches_seqs1og2.txt') as f:
    seqs = f.read().splitlines()
#aname,test0 = parse_fasta('align_seqs_1og2.fasta')
name, ref = parse_fasta("reference.fasta")
aname, test0 = parse_fasta("s100.fasta")


# aligns list of sequences to a reference
def align(Text, Pattern):
    """Overlay each item of `Pattern` onto an 'x'-filled copy of `Text`.

    Starts from a list of 'x' characters as long as `Text`. For every item
    of `Pattern` that occurs in `Text`, the characters of the item are
    written at the position of its FIRST occurrence (``str.find``); items
    that do not occur are ignored. The resulting list is handed to
    ``string`` and that call's result is returned.
    """
    placeholder = ["x"] * len(Text)
    for fragment in Pattern:
        start = Text.find(fragment)
        if start < 0:
            # Fragment does not occur in Text; leave the 'x' fill untouched.
            continue
        placeholder[start:start + len(fragment)] = list(fragment)
    # NOTE(review): `string` is not defined in the visible code - presumably
    # a helper from teitlib that joins the list into a str; confirm.
    return string(placeholder)

# returns all missing indices
parser = argparse.ArgumentParser(description=''' pScatter Interaction visualizer for pLink XLMS data, by Justin Jee (with design by Katelyn McGary Shipper) ''' ) parser.add_argument('fasta_file', type=str, help='fasta file with protein sequences') parser.add_argument('xwalk_file', type=str, help='XWalk output file') parser.add_argument('-a', '--aminoacid', default='K', help='cross-linkable aminoacids. Defaults to Lysine (K).') args = parser.parse_args() fd = {} ld = {} with open(args.fasta_file, 'r') as fi: for name,seq in parse.parse_fasta(fi): fd[name] = [m.start() + 1 for m in re.finditer(args.aminoacid, seq)] ld[name] = len(seq) #xwalk # Line # 1 uvrdnap_ecmodel.pdb LYS-486-A-CB LYS-496-B-CB 652 10.7 11.5 - - - with open(args.xwalk_file, 'r') as fi: xd = {} for line in fi: temparray = line.split() temp1 = temparray[2].split('-') temp2 = temparray[3].split('-')
from parse import parse_fasta from parse import parse_fastq from teitlib import * from time import sleep from time import time import re start_time = time() seqs1 = parse_fastq('SRR11397721_1.fastq', 438663) seqs2 = parse_fastq('SRR11397721_2.fastq', 438663) seqs = seqs1 + seqs2 # delete duplicates seqs = list(dict.fromkeys(seqs)) name, ref = parse_fasta("fasta/reference.fasta") # aligns list of sequences to a reference # returns all missing indices def get_missing_indices(aligned_reference): missing_indices = [] for i in range(len(aligned_reference)): curr = aligned_reference[i] if curr == 'x': missing_indices.append(i) return missing_indices # returns start and end of missing indices def get_missing_indices_brief(aligned_reference):
from parse import parse_fasta from parse import parse_fastq from teitlib import * from time import time start = time() refname,ref = parse_fasta('reference.fasta') aliname,ali = parse_fasta('align_seqs_1og2.fasta') #aliname,ali = parse_fasta('align_seqs1.fasta') #aliname,ali = parse_fasta('align12_reverse.fasta') #aliname,ali = parse_fasta('aligned12_wduplicates.fasta') seqs = parse_fastq('SRR11397721_1.fastq',438663) with open('nomatches_seqs1og2.txt') as f: nomatch = f.read().splitlines() matches = [i for i in seqs if ref.find(i) > -1] printList(matches) mi = [i for i in range(len(ali)) if ali[i] == 'x'] def get_start_and_end_ofxinterval(li): ret = [] ret.append(li[0]) for i in range(len(li)-1): if li[i] != li[i+1]-1: ret.append(li[i]) ret.append(li[i+1])