Esempio n. 1
0
def mainfunc(data, name, numrows):
    '''
    Search every read for a primer/anchor pair and write out the STR sequence between them.

    Each read is tested against every entry of the module-level ``dict_primer``.
    A hit requires the read to start with the primer exactly and
    ``strfuzzy.fuzzyFind(read, anchor, fuzz=1)`` to return a non-negative index.
    The text between the end of the primer and that index is recorded together
    with the next unused entry of ``umidict(numrows)``.

    @param data: The tibble/data frame. It must have an 'ID' column; once indexed
        by 'ID', the first remaining column is used as the read count and the
        second as the R1 read.
    @param name: The output file name prefix; "_noN.tsv" is appended to it.
    @param numrows: Number of rows of the input data to be used to generate random
        UMIs (passed straight to ``umidict``).
    @return: None. Writes a output file with STRseq and other metadata.
    '''
    mypydata = data.set_index('ID').T.to_dict('list')
    UmiSTRLociList = []
    counter_P_A = 0
    counter_P = 0
    umilistIdx = 0
    umilist = umidict(numrows)

    for mydataitems in mypydata.values():
        ReadCount = mydataitems[0]
        readR1 = mydataitems[1]
        for items in dict_primer.values():
            primer = items[3]
            if not readR1.startswith(primer):
                continue
            counter_P += 1

            # The fuzzy anchor search is only worth running once the cheap
            # exact primer test has passed.
            anchor = items[4]
            anchorIndex = strfuzzy.fuzzyFind(readR1, anchor, fuzz=1)
            if anchorIndex >= 0:
                Loci = items[0]
                STRseq = readR1[len(primer):anchorIndex]
                counter_P_A += 1
                # One UMI is consumed per primer+anchor hit.
                # NOTE(review): this indexes past the end if umidict(numrows)
                # yields fewer entries than there are hits - confirm sizing.
                umi = umilist[umilistIdx]
                umilistIdx += 1
                UmiSTRLociList.append((Loci, STRseq, umi, primer, anchor, ReadCount))

    # Count identical records; insertion order is kept for the output.
    UmiSTRLociCount = collections.Counter(UmiSTRLociList)

    outfilename = name + "_noN.tsv"
    with open(outfilename, 'w') as fh:
        fh.write("Number of Primer match = %d, Number of Primer and Anchor = %d\n" % (counter_P, counter_P_A))
        # str() every field: ReadCount (and possibly the UMI) need not be a
        # string, and str.join() rejects non-string items.
        fh.writelines(
            '{}\t{}\n'.format('\t'.join(map(str, k)), v)
            for k, v in UmiSTRLociCount.items()
        )
            else:
                pass
            dict_primer_empty[keyPos] = [
                val1Locus, val2Chr, val3Strand, val4Primer, val5Anchor
            ]
    #sprint (testcount)
    return dict_primer_empty


# Primer table used by the checks below. dict_for_primer() is handed the empty
# dict and the primer file name; presumably it fills the dict in place.
dict_primer = {}

file_primer = "PrimedAnchors.txt"
dict_for_primer(file_primer, dict_primer)

# Single hard-coded read used to try the primer/anchor extraction by hand.
s = "CCCACACGGCCTGGCAACTTATATGTATTTTTGTATTTCATGTGTACATTCGTATCTATCTATCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATTCCCCACAGTGAAAATAATCTACAGGATAGGTAAATAAATTAAGGCATATTCACGCAATGGGATACGATACAGTGATGAAAATGAACTAATTATAGCTACGTGAAACTATACTCATGAACACAATTTTGTAAAAGAAACAGGACTCCAATTTTCGCTCTTCC"

# For every table entry, print what lies between the primer (exact prefix of
# the read) and the anchor (fuzzy search), or 'no match' when either is missing.
for items in dict_primer.items():
    primer, anchor = items[1][3], items[1][4]
    anchorIndex = strfuzzy.fuzzyFind(s, anchor, fuzz=1)
    if not s.startswith(primer) or not anchorIndex >= 0:
        print('no match')
        continue
    print(s[len(primer):anchorIndex])
# Walk the read keys present in both FASTQ dictionaries. For each R2 read that
# starts with 12 characters followed by ATTGGAGTCCT, take those 12 characters
# as the UMI and look for a primer/anchor pair in the matching R1 read.
dict_fastq_R1 = dict_for_fastq(file_fastq_R1)
dict_fastq_R2 = dict_for_fastq(file_fastq_R2)
UmiSTRLociList = []
counterCS_P_A = 0   # reads x primers with CS, primer and anchor all found
counterCS_P = 0     # reads x primers with CS and primer found
counterCS = 0       # reads whose R2 matches the CS pattern
counter_noCS_match = 0
# Compiled once; group(1) is the 12-character UMI preceding the fixed sequence.
CS_PATTERN = re.compile(r'(.{12})(ATTGGAGTCCT)')
for key in set(dict_fastq_R1) & set(dict_fastq_R2):
    readR1 = dict_fastq_R1[key]
    readR2 = dict_fastq_R2[key]
    # Match once per read and reuse the result for every primer.
    searchCS = CS_PATTERN.match(readR2)
    if searchCS is None:
        counter_noCS_match += 1
        continue
    counterCS += 1
    UMI = searchCS.group(1)
    for items in dict_primer.items():
        primer = items[1][3]
        if not readR1.startswith(primer):
            continue
        counterCS_P += 1
        # do fuzzy matching of anchor, only after the exact primer test passed
        anchor = items[1][4]
        anchorIndex = strfuzzy.fuzzyFind(readR1, anchor, fuzz=1)
        if anchorIndex >= 0:
            Loci = items[1][0]
            STRseq = readR1[len(primer):anchorIndex]
            counterCS_P_A += 1
            print(counterCS_P_A)
            UmiSTRLociList.append((Loci, STRseq, UMI, primer, anchor))
import strfuzzy

# Hand-made reads for trying fuzzy primer matching. s begins with the primer
# below; s_1 and s_2 begin with 'G' and 'GG' where s begins with 'C' and 'CA'
# (the remainder looks identical to s).
s = "CAAAAGCCTATTTAGTCAGGGTTTTCCAAGAGATAGACCAATTAGATAGATAGATAGATAGATGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAAAAGATAGATATGAGTAGGGATTTATGAGAGGAATTGGCTCACATGATGATGGAGGCTGAGAATTCTAGGACTCCAATACCTTTTAGC"
s_1 = "GAAAAGCCTATTTAGTCAGGGTTTTCCAAGAGATAGACCAATTAGATAGATAGATAGATAGATGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAAAAGATAGATATGAGTAGGGATTTATGAGAGGAATTGGCTCACATGATGATGGAGGCTGAGAATTCTAGGACTCCAATACCTTTTAGC"
s_2 = "GGAAAGCCTATTTAGTCAGGGTTTTCCAAGAGATAGACCAATTAGATAGATAGATAGATAGATGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAAAAGATAGATATGAGTAGGGATTTATGAGAGGAATTGGCTCACATGATGATGGAGGCTGAGAATTCTAGGACTCCAATACCTTTTAGC"

primer = "CAAAAGCCTATTTAGTCAGGGTTTTCCAAGAGATAG"
anchor = "TTGGCTCACAT"

# Fuzzy-search the primer within the first len(primer) characters of s and
# print the first three items of whatever fuzzyFind returns.
primer_fuzz_tup = strfuzzy.fuzzyFind(s, primer, fuzz=2, end=len(primer))
print(*(primer_fuzz_tup[i] for i in range(3)))