def mainfunc(data, name, numrows):
    '''
    Search each read for a primer and an anchor and pull out the STRseq between them.

    For every read in `data` and every entry of the module-level `dict_primer`,
    a read is kept when it starts with the primer and `strfuzzy.fuzzyFind`
    (fuzz=1) reports a non-negative anchor position. Each hit is paired with the
    next entry of `umidict(numrows)`, identical records are counted, and the
    counts are written to "<name>_noN.tsv".

    @param data: The tibble/data frame. It is re-indexed by its 'ID' column; of the
                 remaining columns the first is used as the read count and the
                 second as the read sequence.
    @param name: The output file name (prefix; "_noN.tsv" is appended).
    @param numrows: Number of rows of the input data to be used to generate random UMIs
    @return: None. Writes a output file with STRseq and other metadata.
    '''
    # {ID: [ReadCount, read, ...]} -- one list of column values per input row.
    mypydata = data.set_index('ID').T.to_dict('list')

    umilist = umidict(numrows)
    umilistIdx = 0
    counter_P = 0
    counter_P_A = 0
    # Count identical records directly; dict insertion order keeps the order of
    # first occurrence, which is the order the rows are written in.
    UmiSTRLociCount = collections.defaultdict(int)

    for mydataitems in mypydata.values():
        ReadCount = mydataitems[0]
        readR1 = mydataitems[1]
        for items in dict_primer.values():
            primer = items[3]
            if not readR1.startswith(primer):
                continue
            counter_P += 1

            # The anchor position is only needed once the primer matched, so
            # the fuzzy search is skipped for all other read/primer pairs.
            anchor = items[4]
            anchorIndex = strfuzzy.fuzzyFind(readR1, anchor, fuzz=1)
            if anchorIndex >= 0:
                Loci = items[0]
                STRseq = readR1[len(primer):anchorIndex]
                counter_P_A += 1
                # NOTE(review): one UMI is consumed per hit; this assumes
                # umidict(numrows) yields at least as many entries as there
                # are hits -- confirm against umidict.
                umi = umilist[umilistIdx]
                umilistIdx += 1
                UmiSTRLociCount[(Loci, STRseq, umi, primer, anchor, ReadCount)] += 1

    outfilename = name + "_noN.tsv"
    with open(outfilename, 'w') as fh:
        fh.write("Number of Primer match = %d, Number of Primer and Anchor = %d\n" % (counter_P, counter_P_A))
        # Fields such as ReadCount may be numeric; str() them so join() cannot
        # fail with a TypeError (string fields are written unchanged).
        fh.writelines(
            '{}\t{}\n'.format('\t'.join(map(str, k)), v)
            for k, v in UmiSTRLociCount.items()
        )
else: pass dict_primer_empty[keyPos] = [ val1Locus, val2Chr, val3Strand, val4Primer, val5Anchor ] #sprint (testcount) return dict_primer_empty dict_primer = {} file_primer = "PrimedAnchors.txt" dict_for_primer(file_primer, dict_primer) #print(dict_primer) s = "CCCACACGGCCTGGCAACTTATATGTATTTTTGTATTTCATGTGTACATTCGTATCTATCTATCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATTCCCCACAGTGAAAATAATCTACAGGATAGGTAAATAAATTAAGGCATATTCACGCAATGGGATACGATACAGTGATGAAAATGAACTAATTATAGCTACGTGAAACTATACTCATGAACACAATTTTGTAAAAGAAACAGGACTCCAATTTTCGCTCTTCC" for items in dict_primer.items(): #print(items[1][3]) #myregex = re.escape(items[1][3]) + (r".*") + re.escape(items[1][4]) anchor = items[1][4] anchorIndex = strfuzzy.fuzzyFind(s, anchor, fuzz=1) primer = items[1][3] if (s.startswith(primer, 0, len(primer)) and (anchorIndex >= 0)): #print(re.match(r'%s(.*)%s' % (primer, anchor), s).group(1)) print(s[len(primer):anchorIndex]) #m = re.match(r'%s.*' % items[1][3], s) #print(r'%s(.*) else: print('no match')
# Pair R1/R2 reads by their shared key. For every R2 read that starts with
# 12 arbitrary characters followed by ATTGGAGTCCT, take those 12 characters as
# the UMI and extract from R1 the sequence between a matching primer and its
# (fuzzily located) anchor.
dict_fastq_R1 = dict_for_fastq(file_fastq_R1)
dict_fastq_R2 = dict_for_fastq(file_fastq_R2)

UmiSTRLociList = []
counterCS_P_A = 0       # R2 pattern + primer + anchor found
counterCS_P = 0         # R2 pattern + primer found
counterCS = 0           # R2 pattern found
counter_noCS_match = 0  # R2 pattern not found

# Compiled once; group(1) is the 12-character UMI.
cs_pattern = re.compile(r'(.{12})(ATTGGAGTCCT)')

for key in set(dict_fastq_R1) & set(dict_fastq_R2):
    readR1 = dict_fastq_R1[key]
    readR2 = dict_fastq_R2[key]

    # Match once and reuse the match object instead of re-running the regex
    # for every primer hit.
    searchCS = cs_pattern.match(readR2)
    if searchCS is None:
        # NOTE(review): the original `else` is read as belonging to the R2
        # pattern test (it counts reads without the pattern) -- confirm.
        counter_noCS_match += 1
        continue

    counterCS += 1
    UMI = searchCS.group(1)

    for primer_info in dict_primer.values():
        primer = primer_info[3]
        if not readR1.startswith(primer):
            continue
        counterCS_P += 1

        # do fuzzy matching of anchor (only needed once the primer matched)
        anchor = primer_info[4]
        anchorIndex = strfuzzy.fuzzyFind(readR1, anchor, fuzz=1)
        if anchorIndex >= 0:
            Loci = primer_info[0]
            STRseq = readR1[len(primer):anchorIndex]
            counterCS_P_A += 1
            print(counterCS_P_A)
            UmiSTRLociList.append((Loci, STRseq, UMI, primer, anchor))
# Scratch check of fuzzy primer matching against a fixed read.
import strfuzzy

# Reference read; it begins with `primer` exactly.
s = "CAAAAGCCTATTTAGTCAGGGTTTTCCAAGAGATAGACCAATTAGATAGATAGATAGATAGATGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAAAAGATAGATATGAGTAGGGATTTATGAGAGGAATTGGCTCACATGATGATGGAGGCTGAGAATTCTAGGACTCCAATACCTTTTAGC"
# Variants of the read whose first base (s_1) / first two bases (s_2) are
# changed to G; not used by the call below.
s_1 = "GAAAAGCCTATTTAGTCAGGGTTTTCCAAGAGATAGACCAATTAGATAGATAGATAGATAGATGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAAAAGATAGATATGAGTAGGGATTTATGAGAGGAATTGGCTCACATGATGATGGAGGCTGAGAATTCTAGGACTCCAATACCTTTTAGC"
s_2 = "GGAAAGCCTATTTAGTCAGGGTTTTCCAAGAGATAGACCAATTAGATAGATAGATAGATAGATGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAAAAGATAGATATGAGTAGGGATTTATGAGAGGAATTGGCTCACATGATGATGGAGGCTGAGAATTCTAGGACTCCAATACCTTTTAGC"

primer = "CAAAAGCCTATTTAGTCAGGGTTTTCCAAGAGATAG"
anchor = "TTGGCTCACAT"

# Look for the primer with fuzz=2, passing the primer length as `end`.
# NOTE(review): the meaning of the three result fields is defined by
# strfuzzy.fuzzyFind and is not visible here -- confirm before relying on it.
primer_fuzz_tup = strfuzzy.fuzzyFind(s, primer, fuzz=2, end=len(primer))

# Print the first three fields of the result, space separated.
first_three = [primer_fuzz_tup[position] for position in range(3)]
print(*first_three)