def main():
    """Rewrite a genePred file so that every transcript name is unique.

    Command-line arguments:
        gpd_infile:  genePred file to read.
        gpd_outfile: genePred file to write.

    Input lines are grouped by the value of their second tab-separated
    column.  Two files are written:

    * ``gpd_outfile`` -- every input entry; when a name occurs more than
      once its second column becomes ``name[i/n]`` where ``i`` is the
      1-based occurrence and ``n`` the number of occurrences.
    * ``gpd_outfile + ".key_file"`` -- one line per entry holding the
      (possibly rewritten) name, a tab, and the original input line.

    Lines beginning with ``#`` and blank lines are skipped.  Names are
    emitted in the order in which they first appear in the input.
    """
    parser = argparse.ArgumentParser(
        description=
        "Make a gpd with unique transcript names and a key to their original gpd entry\n"
    )
    parser.add_argument("gpd_infile", help="FILENAME genepred file")
    parser.add_argument("gpd_outfile", help="FILENAME genepred file")
    args = parser.parse_args()

    seen = {}
    # Plain dict iteration order is arbitrary on older interpreters, so the
    # first-seen order is tracked explicitly to keep the output deterministic.
    order = []
    gfr = GenericFileReader(args.gpd_infile)
    try:
        while True:
            line = gfr.readline()
            if not line:
                break
            if line.startswith('#'):
                continue
            line = line.rstrip()
            if not line:
                # A blank line has no name column; indexing it would raise.
                continue
            tx = line.split("\t")[1]
            if tx not in seen:
                seen[tx] = []
                order.append(tx)
            seen[tx].append(line)
    finally:
        # Release the reader even if a line could not be processed.
        gfr.close()

    with open(args.gpd_outfile, 'w') as of_gpd:
        with open(args.gpd_outfile + ".key_file", 'w') as of_key:
            for tx in order:
                originals = seen[tx]
                total = len(originals)
                for i, original in enumerate(originals):
                    name = tx
                    if total > 1:
                        name = tx + '[' + str(i + 1) + '/' + str(total) + ']'
                    f = original.split("\t")
                    f[1] = name
                    of_key.write(name + "\t" + original + "\n")
                    of_gpd.write("\t".join(f) + "\n")
# Example #2
# 0
 def __init__(self, filename):
     """Load every non-comment line of ``filename`` into ``self.entries``.

     The file is opened through ``GenericFileReader`` and read line by
     line until ``readline`` returns an empty value.  Lines starting with
     ``#`` are ignored; every other line is parsed into a fresh
     ``GenePredEntry`` via ``line_to_entry`` and appended to
     ``self.entries``.  The reader is kept on ``self.gfr`` and is not
     closed here.
     """
     self.filename = filename
     self.gfr = GenericFileReader(filename)
     self.entries = []
     line = self.gfr.readline()
     while line:
         if not re.match('^#', line):
             entry = GenePredEntry()
             entry.line_to_entry(line)
             self.entries.append(entry)
         line = self.gfr.readline()
# Example #3
# 0
 def set_mapping_counts(self, psl_filename):
     """Count the lines of ``psl_filename`` per ``qName`` value.

     Every line is parsed with ``PSLBasics.line_to_entry``.  Lines that
     fail to parse are reported on stderr and skipped.  The resulting
     ``{qName: count}`` dict is stored in ``self.mapping_counts`` and
     ``self.mapping_counts_set`` is then set to ``True``.

     The flag is raised only after the counts are stored, so a failure
     while reading never leaves the flag set without the data.
     """
     qcnts = {}
     gfr0 = GenericFileReader(psl_filename)
     try:
         while True:
             line = gfr0.readline()
             if not line:
                 break
             try:
                 psle = PSLBasics.line_to_entry(line.rstrip())
             except Exception:
                 # Only ordinary errors are treated as a bad line;
                 # KeyboardInterrupt / SystemExit still propagate.
                 sys.stderr.write("Problem parsing line:\n" + line.rstrip() +
                                  "\n")
                 continue
             qname = psle['qName']
             qcnts[qname] = qcnts.get(qname, 0) + 1
     finally:
         # Close the reader even when an unexpected error escapes.
         gfr0.close()
     self.mapping_counts = qcnts
     self.mapping_counts_set = True
# Example #4
# 0
 def __init__(self, filename):
   """Open ``filename`` with ``GenericFileReader`` and reset reader state.

   Stores the path on ``self.filename``, the opened reader on
   ``self.gfr``, and starts with ``self.previous_name`` set to ``None``.
   """
   self.previous_name = None
   self.filename = filename
   self.gfr = GenericFileReader(self.filename)
def main():
    """Split one FASTQ file, or a pair of FASTQ files, into fixed-size chunks.

    Command-line arguments:
        size:             number of records (4 lines each) per output file.
        output_directory: directory to create; it must not exist yet.
        fastq_files:      one or two input files.  With a single input,
                          ``-`` reads from standard input.

    A single input produces ``<n>.fq`` files; a pair produces
    ``<n>_1.fq`` / ``<n>_2.fq`` files, with ``n`` counting from 1.

    On invalid usage (more than two inputs, or an existing output
    directory) an error is written to stderr and the function returns
    without creating anything.

    Output files are created only when there is a record to put in them,
    so no empty trailing chunk is produced when the number of records is
    an exact multiple of ``size``, and no file is produced for empty input.
    """
    parser = argparse.ArgumentParser(
        description=
        'Split FASTQ file(s) into smaller ones with as many entries as you specify'
    )
    parser.add_argument('size',
                        type=int,
                        help='Number of sequences to put into each file')
    parser.add_argument('output_directory',
                        help='Name of the directory to put sequences')
    parser.add_argument('fastq_files',
                        nargs='+',
                        help='FILENAME(S) for fastq files')
    args = parser.parse_args()
    if len(args.fastq_files) > 2:
        sys.stderr.write("ERROR only two fastq files at most are supported\n")
        return
    if os.path.exists(args.output_directory):
        sys.stderr.write("ERROR output directory exists already\n")
        return
    os.makedirs(args.output_directory)

    single = len(args.fastq_files) == 1
    suffixes = [''] if single else ['_1', '_2']
    readers = []
    try:
        for fastq_name in args.fastq_files:
            # '-' means standard input, but only for a single input file.
            if single and fastq_name == '-':
                readers.append(sys.stdin)
            else:
                readers.append(GenericFileReader(fastq_name))
        _split_fastq_records(readers, args.output_directory, args.size,
                             suffixes)
    finally:
        # Close every reader that was opened, even after a failure.
        for reader in readers:
            reader.close()


def _split_fastq_records(readers, output_directory, size, suffixes):
    """Copy 4-line records from ``readers`` into numbered chunk files.

    ``readers`` and ``suffixes`` are parallel sequences: records read from
    ``readers[k]`` go to ``<chunk><suffixes[k]>.fq`` inside
    ``output_directory``.  Readers advance in lockstep, one record each
    per step.  Copying stops when any reader is exhausted; if only some
    of them are, a length-mismatch warning is written to stderr.
    """
    chunk_index = 0
    records_in_chunk = 0
    outputs = []
    try:
        while True:
            headers = [reader.readline() for reader in readers]
            if not all(headers):
                if any(headers):
                    sys.stderr.write(
                        "WARNING paired file lengths appear different\n")
                break
            if not outputs:
                # Open the next chunk lazily so no empty file is left
                # behind when the input ends on a chunk boundary.
                chunk_index += 1
                records_in_chunk = 0
                for suffix in suffixes:
                    chunk_path = os.path.join(
                        output_directory, str(chunk_index) + suffix + '.fq')
                    outputs.append(open(chunk_path, 'w'))
            for reader, header, out in zip(readers, headers, outputs):
                out.write(header)
                # Remaining three lines of the record.
                for _ in range(3):
                    out.write(reader.readline())
            records_in_chunk += 1
            if size <= records_in_chunk:
                for out in outputs:
                    out.close()
                outputs = []
    finally:
        # Flush and close a partially filled final chunk.
        for out in outputs:
            out.close()
def main():
    """Convert PSL alignments to genePred entries and pass them to ``nudge``.

    Parses the command line, collects the junctions of the reference
    genePred per chromosome, then for each line of the PSL input builds a
    gap-smoothed genePred entry, hands it to ``nudge`` together with the
    reference junctions of its target, and prints the result either as a
    fake PSL line (when ``--output_fake_psl`` is given) or as a genePred
    line.
    """
    parser = argparse.ArgumentParser(
        description='Use reference junctions when they are close',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--min_intron_size',
                        type=int,
                        default=68,
                        help="INT min intron size")
    parser.add_argument(
        '--min_local_support',
        type=int,
        default=0,
        help=
        "INT min number of junctions within search_size of a junction in order to count it"
    )
    parser.add_argument('--search_size',
                        type=int,
                        default=10,
                        help="INT search space for reference")
    parser.add_argument(
        '--output_fake_psl',
        help="FASTAFILE reference genome to make a fake PSL output")
    parser.add_argument('psl', help="PSLFILENAME or '-' for STDIN")
    # NOTE(review): the help text says FASTAFILENAME, but the value is
    # passed to GenePredBasics.GenePredFile below -- confirm the wording.
    parser.add_argument('reference_genepred',
                        help="FASTAFILENAME for reference genepred")
    args = parser.parse_args()

    # NOTE(review): cpus is not used in the visible part of this function.
    cpus = multiprocessing.cpu_count()

    # The genome is only loaded when fake PSL output was requested.
    genome = {}
    if args.output_fake_psl:
        genome = read_fasta_into_hash(args.output_fake_psl)

    # Read in the reference genepred first.
    gpf = GenePredBasics.GenePredFile(args.reference_genepred)
    # Index the reference junctions by chromosome:
    # ref[chrom][exonEnds[i - 1]][exonStarts[i] + 1] = strand
    ref = {}
    for e in [x.entry for x in gpf.entries]:
        # Entries with at most one exon start have no junction to record.
        if len(e['exonStarts']) <= 1: continue
        if e['chrom'] not in ref:
            ref[e['chrom']] = {}
        for i in range(1, len(e['exonStarts'])):
            if e['exonEnds'][i - 1] not in ref[e['chrom']]:
                ref[e['chrom']][e['exonEnds'][i - 1]] = {}
            # Only the first strand seen for a given junction is kept.
            if e['exonStarts'][i] + 1 not in ref[e['chrom']][e['exonEnds'][i -
                                                                           1]]:
                ref[e['chrom']][e['exonEnds'][i - 1]][e['exonStarts'][i] +
                                                      1] = e['strand']
    # Stored all junctions as 1-base

    # NOTE(review): read_info and fcount_total are not used in the visible
    # part of this function, and pf is not closed here.
    read_info = {}
    pf = GenericFileReader(args.psl)
    fcount_total = 0
    while True:
        line = pf.readline()
        if not line: break
        if re.match('^#', line): continue
        line = line.rstrip()
        pe = PSLBasics.line_to_entry(line)
        # Skip entries whose block lists do not all have the same length.
        if len(pe['tStarts']) != len(pe['blockSizes']) or len(
                pe['qStarts']) != len(pe['blockSizes']):
            sys.stderr.write("WARNING invalid psl\n")
            continue
        genepred_line = PSLBasics.convert_entry_to_genepred_line(pe)
        ge = GenePredBasics.smooth_gaps(
            GenePredBasics.line_to_entry(genepred_line), args.min_intron_size)
        # Reference junctions for this alignment's target; empty when the
        # target name is absent from the reference.
        refjuns = {}
        if pe['tName'] in ref: refjuns = ref[pe['tName']]
        # nudge is defined outside this view; presumably it moves junctions
        # onto nearby reference junctions -- TODO confirm.
        new_ge = nudge(pe, ge, refjuns, args)
        if args.output_fake_psl:
            new_psl_line = GenePredBasics.entry_to_fake_psl_line(
                new_ge, genome)
            print new_psl_line
        else:
            print GenePredBasics.entry_to_line(new_ge)