Example #1
import optparse

#get_free_memory, grab_files, transform, and write_csv are helper functions
#defined elsewhere in this module.


def main():
    """Parse command line args, and call appropriate functions."""
    usage = """\nusage: %prog [options]\n"""
    parser = optparse.OptionParser(usage=usage)
    #Other option types are int and float, string is default.
    #Note there is also a default parameter.
    parser.add_option('-d', '--dir', dest="hmm_fit_dir", type="string")
    parser.add_option('-o', '--out', dest="out_path", type="string")
    parser.add_option('-t', '--thresh', dest="pnathresh", type="float", default=.03)
    #Args taken from sys.argv[1:] by default, parsed using GNU/POSIX syntax.
    opts, args = parser.parse_args()
    if not (opts.hmm_fit_dir and opts.out_path):
        parser.error("A directory for locating hmm_fit data and an output file path are required.")

    print "Starting hmmprob_to_est.py with parameters:", str(opts)
    print "Free memory is %s MB" % get_free_memory()
    all_files = grab_files(opts.hmm_fit_dir)
    print "Found %s files" % len(all_files)
    d_ests = transform(all_files, opts.pnathresh)
    print "Free memory now is %s MB" % get_free_memory()
    write_csv(d_ests, opts.out_path)
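Since main() is mostly option plumbing, here is a minimal sketch of how this parser behaves when driven directly; the argument values are hypothetical, and parse_args is given an explicit list instead of reading sys.argv:

import optparse

parser = optparse.OptionParser(usage="\nusage: %prog [options]\n")
parser.add_option('-d', '--dir', dest="hmm_fit_dir", type="string")
parser.add_option('-o', '--out', dest="out_path", type="string")
parser.add_option('-t', '--thresh', dest="pnathresh", type="float", default=.03)

#parse_args accepts an explicit argv list, which is handy for testing.
opts, args = parser.parse_args(['-d', 'hmm_fit', '-o', 'ests.csv'])
print opts.hmm_fit_dir  #-> hmm_fit
print opts.pnathresh    #-> 0.03 (the default, since -t was not given)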
Example #2
import csv

#get_free_memory is a helper defined elsewhere in this module.


def write_csv(d_ests, out_path):
    """Take a (filtered) dict and write it out as a single CSV matrix.

    See the transform function for an explanation of what d_ests is and an
    example of what it can contain.
    """
    #Write to CSV
    outfile = open(out_path, 'wb')
    outcsv = csv.writer(outfile)

    #Set up data to fill in, and be written to file
    #Header rows:
    header_row, chrom_row, gen_map_pos_row = ['individual'], [''], ['']
    #seed with ind names (Make first column of each data row hold the ind name.)
    csv_data = []
    for d_inds in d_ests.values():
        for ind_name in d_inds:
            csv_data.append((ind_name,))
    #Sort and make sure there are no duplicates
    csv_data = sorted(list(set(csv_data)))
    #change rows from tuples to lists
    csv_data = [list(row) for row in csv_data]
    #build an index of csv_data so we can quickly put an individual's data in the right place
    r = row_by_ind_name = {}
    for i,row in enumerate(csv_data):
        r[row[0]] = i

    #fill in data
    for chrom, d_inds in d_ests.items():
        #Make a sorted list of all positions in this chromosome
        #(ests_by_pos, not d_ests, to avoid shadowing the outer dict)
        all_positions = set()
        for ests_by_pos in d_inds.values():
            all_positions |= set(ests_by_pos.keys())
        all_positions = sorted(all_positions)
        
        if all_positions: #only include chroms with data
            #Update header rows with these positions / chromosomes
            header_row += ['%s-%s' % (chrom, v) for v in all_positions]
            chrom_row += ([chrom] * len(all_positions))
            gen_map_pos_row += [i+1 for i in range(len(all_positions))]

            #Store actual data to be written.  Iterate over every known
            #individual (not just those in d_inds) so rows stay aligned when
            #an individual has no data for this chromosome.
            for ind_name, row_idx in r.items():
                outrow = csv_data[row_idx]
                ests_by_pos = d_inds.get(ind_name, {})
                for pos in all_positions:
                    outrow.append(ests_by_pos.get(pos, '-'))
    
    outcsv.writerow(header_row)
    outcsv.writerow(chrom_row)
    outcsv.writerow(gen_map_pos_row)
    outcsv.writerows(csv_data)
    print "Free memory after writing CSV is %s MB" % get_free_memory()
    outfile.close()
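To make the matrix layout concrete, here is a hypothetical toy input and the file write_csv would produce for it; the chromosome, individual, and position names are invented. Note that positions are dict keys stored as strings, so all_positions sorts them lexicographically, not numerically:

#Hypothetical toy input, invented for illustration.
d_ests = {
    '2R': {'indA': {'100': '1', '200': '3'},
           'indB': {'100': '2'}},
}
write_csv(d_ests, 'toy.csv')
#toy.csv would contain, roughly:
#
#  individual,2R-100,2R-200
#  ,2R,2R
#  ,1,2
#  indA,1,3
#  indB,2,-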
Example #3
import csv

#COL_POS, COL_COUNT, COL_EST, parse_path, and get_free_memory are defined
#elsewhere in this module.


def transform(file_list, pnathresh):
    """
    Groups position ests by individual and by chromosome, and filters out
    positions covered in fewer than pnathresh (as a fraction) of individuals.
    """

    #d_ests stores estimates by position by individual by chromosome.
    #example:
    #    {'2R': {'indivA12_AATAAG': {'1000992': '1',
    #                                '10065531': '3',
    #                                '9987712': '1'},
    #            'indivE12_GTATCG': {'10002269': '3',
    #                                '10022498': '3',
    #                                '10079005': '3'},
    #                                },
    #     '3R': ...
    #        }
    d_ests = {}
    chrom_pos_count = {}  #count of individuals with a given (chrom,position)

    #Fill up data structure from all files
    for path in file_list:
        ind_name, chrom = parse_path(path)
        if chrom not in d_ests:
            d_ests[chrom] = {}
        if ind_name not in d_ests[chrom]:
            d_ests[chrom][ind_name] = {}
        csv_reader = csv.reader(open(path, 'rb'))
        csv_reader.next()  #skip header row
        for row in csv_reader:
            pos, count, est = row[COL_POS], row[COL_COUNT], row[COL_EST]
            d_ests[chrom][ind_name][pos] = est
            chrom_pos_count[(chrom, pos)] = chrom_pos_count.get((chrom, pos), 0) + 1

    print "(mid transform function) Free memory now is %s MB" % get_free_memory()

    #Remove positions seen in fewer individuals than the pnathresh fraction
    #(example: if pnathresh is .1, we throw out a chromosome position entirely
    #if it exists for less than 10% of individuals)
    num_inds = max(len(d_inds) for d_inds in d_ests.values())
    print "There are %s individuals" % num_inds
    count_thresh = int(round(pnathresh * num_inds))
    print "Will throw out chrom/positions with fewer than %s individuals." % count_thresh
    print "(that's int(round(pnathresh %s * %s individuals)) = %s )" % (pnathresh, num_inds, count_thresh)

    for chrom, d_inds in d_ests.items():
        for ind_name, ests_by_pos in d_inds.items():
            for pos in ests_by_pos.keys():
                if chrom_pos_count[(chrom, pos)] < count_thresh:
                    del d_ests[chrom][ind_name][pos]
    return d_ests
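A quick worked example of the threshold arithmetic above, with a hypothetical individual count:

pnathresh = .03
num_inds = 96  #hypothetical
count_thresh = int(round(pnathresh * num_inds))  #int(round(2.88)) == 3
#A (chrom, pos) key counted for only 1 or 2 of the 96 individuals is deleted;
#one counted for 3 or more survives the filter.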
Example #4
import gc
import optparse

#get_free_memory and merge are helpers defined elsewhere in this module.


def main():
    """Parse command line args, and call appropriate functions."""
    # disable garbage collection for a 10% speed boost
    gc.disable()

    usage = """\nusage: %prog [options]\n"""
    parser = optparse.OptionParser(usage=usage)
    # Other option types are int and float, string is default.
    # Note there is also a default parameter.
    parser.add_option("-d", "--dir", dest="hmm_fit_dir", type="string")
    # ?? Need these ??  -c $params{'chroms'} -p $params{'chroms2plot'} -d hmm_fit -t $params{'thinfac'} -f $params{'difffac'} -b $params{'barcodes'} -n $params{'pnathresh'}
    # parser.add_option('-o','--out',dest="out_path",type="string")
    # parser.add_option('-t','--thresh',dest="pnathresh",type="float",default=.03)
    opts, args = parser.parse_args()  # Args taken from sys.argv[1:] by default, parsed using GNU/POSIX syntax.
    if not opts.hmm_fit_dir:
        parser.error("A directory for locating hmm_fit data is required.")

    print "Starting combine.py with parameters:", str(opts)
    print "Free memory is %s MB" % get_free_memory()
    merge(opts.hmm_fit_dir)
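The gc.disable() call above trades automatic garbage collection for speed while a large number of long-lived objects are built up; merge() (Example #5 below) then calls gc.collect() at known checkpoints. A minimal sketch of the pattern, with a hypothetical loading step:

import gc

gc.disable()              #no automatic collection passes during the build-up
data = load_everything()  #load_everything is a hypothetical bulk-loading step
gc.collect()              #one explicit pass once the object churn is over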
Example #5
import csv
import gc

#DEBUG, input_data_sets, fix_values, and get_free_memory are defined
#elsewhere in this module.


def merge(hmm_fit_dir):
    """
    Combine all individuals and datapoints into one row per individual, with
    columns keyed by chrom:position.  Interpolate missing values in some cases.
    (The R code we're replicating handled this inconsistently, so there are a
    few special cases; see the comments below.)
    Write out one tsv file for each parent.
    """

    # Combine all individuals/positions into a big dictionary (think of it like a sparse table)
    # for each parent
    dp1, dp2 = {}, {}
    for (array, ind, chrom) in input_data_sets(hmm_fit_dir):
        print ind, chrom, len(array), "records"
        for x in array:
            key = (ind, chrom, int(x["pos"]))
            dp1[key] = x["par1"]
            dp2[key] = x["par2"]

    gc.collect()
    print "Done loading rdata files."
    print "Free memory is %s MB" % get_free_memory()

    # write out to files and interpolate as we go.  The R code we're replacing had some weird special cases so look out for those.
    for (fname, dp) in (("ancestry-probs-par1.tsv", dp1), ("ancestry-probs-par2.tsv", dp2)):
        if DEBUG:
            fname = "test." + fname
        print "Compiling data for file", fname
        # Get all positions (chrom,pos) sorted by chrom, then by position
        positions = sorted(set([(k[1], k[2]) for k in dp.keys()]))
        header = [""] + ["".join((p[0], ":", str(p[1]))) for p in positions]
        # Get all individuals, sorted
        inds = sorted(set([k[0] for k in dp.keys()]))
        # Build up each row to be written to the file (all individuals x all positions)
        outrows = []
        for ind in inds:
            print "    ", ind
            # initialize/clear out bookkeeping variables
            last_pos_w_val, last_val, last_chrom, to_interpolate = None, None, None, []

            outrow = [ind]  # first column is individual name
            for (chrom, pos) in positions:

                # Handle switching to new chromosome
                if chrom != last_chrom:
                    # set any positions waiting for interpolation to 0 since we've reached the end of the chrom
                    # however we want to leave positions after last_pos_w_val as NA rather than
                    # interpolate between last_pos_w_val and the end of the chrom, because that's what R did.
                    for (update_pos, insert_loc) in to_interpolate:
                        if update_pos < last_pos_w_val:
                            outrow[insert_loc] = "0"
                    # clear out bookkeeping vars on new chrom
                    last_pos_w_val, last_val, last_chrom, to_interpolate = None, None, None, []

                key = (ind, chrom, pos)
                if (key in dp) and ((dp[key] > 0.0000005) or (last_val and last_val > 0.0000005)):
                    # This condition checks whether A. data exists for this position and it's non-zero,
                    # OR B. data exists and the last value seen was non-zero.
                    # These are the cases where we want to use this value and the last seen value to
                    # interpolate the positions waiting in the interpolation queue.

                    # Store value in outrow to be written to file
                    outrow.append("%.6f" % round(dp[key], 6))
                    # interpolate any positions waiting for a new value
                    for (update_pos, insert_loc) in to_interpolate:
                        if update_pos < last_pos_w_val:
                            # zero out any pending positions before the last value we saw, since this is what R did.
                            outrow[insert_loc] = "0"
                        else:
                            insert_val = last_val + (
                                (dp[key] - last_val) * (float(update_pos - last_pos_w_val) / (pos - last_pos_w_val))
                            )
                            outrow[insert_loc] = "%.6f" % round(insert_val, 6)
                    to_interpolate = []  # since all pending positions have been interpolated, clear this out
                    last_pos_w_val, last_val = pos, dp[key]
                elif last_val and not (key in dp):
                    # If a value has been seen for this chrom, we'll want to start interpolating
                    # Add a placeholder to outrow
                    outrow.append("NA")  #
                    # Mark position for later interpolation
                    to_interpolate.append((pos, len(outrow) - 1))
                else:
                    # don't interpolate
                    if key in dp:
                        # data exists for key but it's 0, Store value in outrow, but update bookkeeping vars
                        outrow.append("%.6f" % round(dp[key], 6))  # should be 0
                        # still count 0 as a last value for interpolation
                        last_pos_w_val, last_val = pos, dp[key]
                    else:
                        outrow.append("NA")
                last_chrom = chrom

            # set any positions waiting for interpolation to 0 since we've reached the end of the individual
            # however we want to leave positions after last_pos_w_val as NA rather than interpolate
            # between last_pos_w_val and the end, because that's what R did.
            for (update_pos, insert_loc) in to_interpolate:
                if update_pos < last_pos_w_val:
                    outrow[insert_loc] = "0"

            outrows.append(outrow)

        fix_values(outrows)

        print "Writing file", fname
        csvout = csv.writer(open(fname, "wb"), delimiter="\t", quoting=csv.QUOTE_MINIMAL)
        csvout.writerow(header)
        csvout.writerows(outrows)
        gc.collect()
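A worked example of the linear interpolation step above, with hypothetical positions and values:

last_pos_w_val, last_val = 100, 0.2  #last position that had a value
pos, new_val = 200, 0.8              #current position and its value (dp[key])
update_pos = 150                     #a pending NA position in the queue

insert_val = last_val + ((new_val - last_val) *
                         (float(update_pos - last_pos_w_val) /
                          (pos - last_pos_w_val)))
print "%.6f" % round(insert_val, 6)  #-> 0.500000, halfway between 0.2 and 0.8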