Example #1
def main(argv):
    if len(argv) != 4:
        raise Exception(
            "Wrong number of arguments. Usage: input output buildID-file")
    inPath = argv[1]
    outPath = argv[2]
    buildIdPath = argv[3]

    build_number = None

    inFile = open(inPath, "rb")
    try:
        table = tsv.TsvReader(inFile, TABLE_SCHEMA)
    except csv.Error:
        print "Warning: ragged table. Assuming excel_tab and correcting"
        inFile.close()
        inFile = open(inPath, "rb")
        # Write the de-ragged copy to its own temp path, so we don't clobber
        # the real output file that still needs to be written below.
        fixedPath = inPath + ".fixed_raggedness"
        outFile = open(fixedPath, "wb")
        tsv.fixRaggedTable(inFile, outFile, csv.excel_tab)
        outFile.flush()
        outFile.close()
        del outFile
        inFile = open(fixedPath)
        try:
            table = tsv.TsvReader(inFile, TABLE_SCHEMA)
        except:
            print "Well, that didn't work"
            raise
    # end handler for ragged table

    line = 0
    output = open(outPath, "w")
    for record in table:
        line += 1
        bNum = record["NCBI_Build"]
        if (build_number is not None) and (bNum != build_number):
            raise Exception(
                "Inconsistent NCBI_Build values (prev: %s; current: %s); cleave table first"
                % (str(build_number), str(bNum)))
        build_number = bNum

        # TSV rows can't be written into, only read, so copy the row into a
        # list first.
        writableRecord = [cell for cell in record]
        writableRecord[LINENUM_INDEX] = "line" + str(line)

        output.write("\t".join(writableRecord))
        output.write("\n")

    output.flush()
    output.close()
    del output

    buildIdOut = open(buildIdPath, "w")
    buildIdOut.write(NCBI_BUILD_LUT[build_number])
    buildIdOut.flush()
    buildIdOut.close()
    del buildIdOut
    return 0
Example #2

    def __init__(self, assembly, contig, start, end):
        """
        Given a range on a contig, get all the repeats overlapping that range.
        
        Keeps an IntervalTree of element names, and a Counter from element
        name to number of that element in the range.
        
        No protection against SQL injection.
        
        """

        # Make the interval tree
        self.tree = IntervalTree()

        # Make a counter for repeats with a certain name
        self.counts = collections.Counter()

        command = [
            "hgsql", "-e", "select repName, genoName, genoStart, genoEnd "
            "from {}.rmsk where genoName = '{}' and genoStart > '{}' "
            "and genoEnd < '{}';".format(assembly, contig, start, end)
        ]
        process = subprocess.Popen(command, stdout=subprocess.PIPE)

        for parts in itertools.islice(tsv.TsvReader(process.stdout), 1, None):
            # For each line except the first, broken into fields

            # Add the item to the tree covering its range. Store the repeat type
            # name as the interval's data.
            self.tree.addi(int(parts[2]), int(parts[3]), parts[0])

            # Count it
            self.counts[parts[0]] += 1
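
A standalone sketch of how the two structures this constructor builds might be queried, assuming the IntervalTree here comes from the intervaltree package (which provides addi() and slice-style overlap queries); the repeat data below is made up.

import collections
from intervaltree import IntervalTree

# Build the same structures by hand, with made-up repeats.
tree = IntervalTree()
counts = collections.Counter()
for name, start, end in [("AluY", 100, 400), ("L1", 350, 900)]:
    tree.addi(start, end, name)   # store the repeat name as the interval's data
    counts[name] += 1

# Repeats overlapping positions 300-500; each Interval has .begin, .end and .data.
for interval in sorted(tree[300:500]):
    print(interval.begin, interval.end, interval.data)

# Most common repeat families in the whole range.
print(counts.most_common())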
Example #3
def get_max_f_score(job, gam_key, condition, options):
    """
    Given the GAM file key for a sample that has already had vcfeval run under
    the given conditions, parse the vcfeval roc and return the biggest F score.
    
    """
    
    # Make the IOStore
    cache_store = IOStore.get(options.cache)
    
    # Find the ROC curve
    roc_key = vcfeval_roc_key(gam_key, condition)
    
    # Get the file
    roc_compressed = cache_store.get_input_file(job, roc_key)
    
    # Read it
    reader = tsv.TsvReader(gzip.GzipFile(roc_compressed))
    
    # What's the max F score we found?
    max_f_score = None
    for parts in reader:
        # Parse all the F scores
        f_score = float(parts[6])
        
        if max_f_score is None or f_score > max_f_score:
            # And keep the max
            max_f_score = f_score
            
    # Return the max F score.
    return max_f_score
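
For readers who only want the core pattern, here is a self-contained sketch that mirrors the reader call above: stream a gzipped TSV and keep the largest value in column 6. The file path is a placeholder, and the IOStore and vcfeval_roc_key helpers are project-specific and not needed for it.

import gzip
import tsv

max_f_score = None
for parts in tsv.TsvReader(gzip.GzipFile("roc.tsv.gz")):  # placeholder path
    try:
        f_score = float(parts[6])
    except (IndexError, ValueError):
        # Skip header or short rows rather than crashing.
        continue
    if max_f_score is None or f_score > max_f_score:
        max_f_score = f_score
print(max_f_score)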
Example #4

def read_bootstraps(root_path):
    quant_bootstraps = tsv.TsvReader(open(root_path + "quant_bootstraps.tsv"))
    count = 0
    quant_boot = []
    for parts in quant_bootstraps:
        quant_boot.append(parts)

    df_quant_boot = pd.DataFrame.from_records(quant_boot[1:],
                                              columns=quant_boot[0])
    id_qb = list(df_quant_boot.columns)
    return df_quant_boot, id_qb
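
A hedged usage sketch for read_bootstraps; the directory path is a placeholder and is assumed to contain a quant_bootstraps.tsv file.

# Hypothetical call; "../data/poly_mo/" is a placeholder directory.
df_quant_boot, transcript_ids = read_bootstraps("../data/poly_mo/")
print(df_quant_boot.shape)
print(transcript_ids[:5])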
Example #5
def generateTxtFromMetas(pathToEmbeddings, pathToMeta, outputPath):
    reader = tsv.TsvReader(open(pathToEmbeddings))
    meta = tsv.TsvReader(open(pathToMeta))
    chara = []
    label = []

    for zi, line in meta:
        label.append(zi)

    label = label[1:]
    # print(len(label))
    with open(outputPath, 'w') as f:
        for count, embedding in enumerate(reader):
            em = list(embedding)
            if count == 0:
                size = len(em)
                f.write(str(len(label)) + " " + str(size) + "\n")
            data = " ".join(em)
            if (count < len(label)):
                f.write(label[count] + " " + data + "\n")
Example #6

def getRegions(metadata_url):
    """
    Download the assembly metadata file at the given URL, and return a dict from
    upper-case region names to 0-based end-exclusive (contig, start, end)
    tuples. Contig names start with "chr".
    
    """

    # Holds the chromosome number for each region?
    region_chromosomes = {}
    # Holds the minimum start position for each region on its chromosome
    region_starts = collections.defaultdict(lambda: float("inf"))
    # Holds the maximum stop position for each region on its chromosome
    region_stops = collections.defaultdict(lambda: float("-inf"))

    # Holds the (contig, start, end) tuple for each alt in a given region.
    ranges_by_region = collections.defaultdict(list)

    # Hard-code some regions that aren't real alt regions
    ranges_by_region["BRCA1"] = ("chr17", 43044294, 43125482)
    ranges_by_region["BRCA2"] = ("chr13", 32314861, 32399849)
    ranges_by_region["CENX"] = ("chrX", 58605580, 62412542)

    # Read the reference database
    database = tsv.TsvReader(urllib2.urlopen(metadata_url))

    for parts in database:
        # Parse out all the info for this alt and its parent chromosome
        region_name = parts[7]
        # Grab the chromosome ("1" or "X") that's the parent
        parent_chromosome = parts[5]
        parent_start = int(parts[11])
        parent_stop = int(parts[12])
        alt_contig = parts[3]
        alt_start = int(parts[9])
        alt_stop = int(parts[10])

        # Note the region start, stop, and parent chromosome number
        region_chromosomes[region_name] = parent_chromosome
        region_starts[region_name] = min(region_starts[region_name],
                                         parent_start)
        region_stops[region_name] = max(region_stops[region_name], parent_stop)

    for region_name in region_chromosomes.iterkeys():
        # Add in the reference ranges that all the alts are alternatives to
        # Make sure to add the chr prefix.
        ranges_by_region[region_name] = ("chr" +
                                         region_chromosomes[region_name],
                                         region_starts[region_name],
                                         region_stops[region_name])

    # Give back our region info dict
    return ranges_by_region
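
The accumulation pattern above (a defaultdict seeded with plus or minus infinity, tightened with min and max) is easy to try on its own. A self-contained sketch with made-up rows:

import collections

rows = [("REGION_A", 100, 500), ("REGION_A", 50, 300), ("REGION_B", 10, 20)]

region_starts = collections.defaultdict(lambda: float("inf"))
region_stops = collections.defaultdict(lambda: float("-inf"))
for region, start, stop in rows:
    # Keep the smallest start and the largest stop seen for each region.
    region_starts[region] = min(region_starts[region], start)
    region_stops[region] = max(region_stops[region], stop)

print(dict(region_starts))  # {'REGION_A': 50, 'REGION_B': 10}
print(dict(region_stops))   # {'REGION_A': 500, 'REGION_B': 20}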
Example #7
def main():

    # Where do we put them?
    DATABASE_PATH = 'Database'

    # This holds total price by keyword, for all observed keywords
    total_by_keyword = collections.defaultdict(float)
    occurrences = collections.Counter()

    # This holds an example for each keyword
    examples = dict()

    for root, dirs, files in os.walk(DATABASE_PATH):
        for filename in files:
            if filename.endswith('.tsv'):
                # We found a TSV

                with open(os.path.join(root, filename)) as tsv_in:
                    # Read the TSV
                    reader = tsv.TsvReader(tsv_in)

                    for item, price in reader:
                        # For each recorded item
                        # Parse the price
                        price = float(price)

                        if math.isnan(price):
                            # Skip unpriceable items
                            continue

                        # Compute all unique keywords in the item
                        keywords = set(item.upper().split())

                        for keyword in keywords:
                            # The price contributes to every keyword
                            total_by_keyword[keyword] += price
                            # We count the occurrences
                            occurrences[keyword] += 1
                            if keyword not in examples or random.random() < 0.5:
                                # This ought to be our example for this keyword
                                examples[keyword] = item.upper()

    # Make a big table
    keywords_with_totals = list(total_by_keyword.items())
    # Sort by total cost, descending
    keywords_with_totals.sort(key=operator.itemgetter(1), reverse=True)

    print("=== Top 10 Expensive Grocery Keywords ===")
    for i, (keyword, cost) in enumerate(keywords_with_totals[:10]):
        print("#{}:\t${}\t{} (x{}, e.g. \"{}\")".format(
            i + 1, cost, keyword, occurrences[keyword], examples[keyword]))
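
One note on the example selection above: replacing the stored example with probability 0.5 favors later occurrences of a keyword. If a uniformly random example per keyword is wanted, classic reservoir sampling (keep the k-th occurrence with probability 1/k) does it in one pass. A minimal, self-contained sketch; the function name and items are made up.

import random

def reservoir_pick(items):
    """Return one item chosen uniformly at random from an iterable, in one pass."""
    pick = None
    for k, item in enumerate(items, start=1):
        # Keep the k-th item with probability 1/k, which makes the final pick uniform.
        if random.random() < 1.0 / k:
            pick = item
    return pick

print(reservoir_pick(["MILK", "BREAD", "EGGS"]))  # made-up grocery items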
Example #8
def url_open_tsv(url):
    """
    Open a TSV URL and loop through the lines as lists.
    
    """

    try:
        reader = tsv.TsvReader(urllib2.urlopen(url))
    except urllib2.URLError as err:
        print("Could not open " + url)
        raise err

    return reader
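
A hedged usage sketch for url_open_tsv; the URL is a placeholder, not a real endpoint.

# Hypothetical call; swap in a URL that actually serves tab-separated data.
for parts in url_open_tsv("http://example.com/table.tsv"):
    print(list(parts))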
Example #9
    def parse_name_stream(self, stream):
        """
        Parse GRC chr2acc format (TSV of name and accession.version) on the
        given input stream, and make the appropriate primary scaffold name
        assignments.
        """

        # Make a TSV reader
        reader = tsv.TsvReader(stream)

        for name, accession in reader:
            # Apply each name/accession mapping
            self.set_chromosome_name(accession, name)
Example #10
    def parse_placement_stream(self, stream):
        """
        Parse GRC alt_scaffold_placement.txt format (TSV with alt
        accession.version in column 4 and parent accession.version in column 7)
        on the given input stream, and make the appropriate parent assignments.
        """

        # Make a TSV reader
        reader = tsv.TsvReader(stream)

        for parts in reader:
            # Look at each non-comment line

            if len(parts) < 7:
                # We can't pull out the parent
                raise RuntimeError(
                    "Insufficient columns in alt scaffold location data")

            # Make the alt (1-based column 4) a child of the parent (1-based
            # column 7)
            self.set_alt_parent(parts[3], parts[6])
Example #11

def main(args):
    """
    Parses command line arguments and does the work of the program.
    "args" specifies the program arguments, with args[0] being the executable
    name. The return value should be used as the program's exit code.
    """

    if len(args) == 2 and args[1] == "--test":
        # Run the tests
        return doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE)

    options = parse_args(args)  # This holds the nicely-parsed options object

    # Load the sample whitelist, if applicable. Holds a set if we have a
    # whitelist, or None otherwise.
    sample_whitelist = None
    if options.samples is not None:
        # Read all the samples from the file
        sample_whitelist = set(
            [line[0] for line in tsv.TsvReader(options.samples)])

    RealTimeLogger.start_master()

    # Make a root job
    root_job = Job.wrapJobFn(scan_all,
                             options,
                             sample_whitelist,
                             cores=1,
                             memory="1G",
                             disk="1G")

    # Run it and see how many jobs fail
    failed_jobs = Job.Runner.startToil(root_job, options)

    if failed_jobs > 0:
        raise Exception("{} jobs failed!".format(failed_jobs))

    print("All jobs completed successfully")

    RealTimeLogger.stop_master()
Example #12
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
@author: celdel

Created to produce a clean encods_1.csv containing the encodings of all the images.
"""

import tsv
import pandas as pd
import csv

reader = tsv.TsvReader(open("encods.tsv"))
new_list = []
for parts in reader:
    new_list.append(list(parts))

df = pd.DataFrame(new_list)
df.to_csv("encods_1.csv")
Example #13
def process_raw_data(raw_data, old_html_dir, options):
    """
    This function receives the file containing the raw genomic data that the
    user wants to map onto the pre-existing visualization, plus the location of
    the pre-existing visualization files. We will parse this new data file,
    placing the rows in an order defined by the genes tab from the pre-existing
    visualization. This way we generate a mutable numpy matrix of raw patient
    data with the genes in the order required by the transform matrices,
    U^T and S.
    """
    # Create the file paths for the required files
    genes_file_loc = os.path.join(old_html_dir, "genes.tab")
    s_matrix_file_loc = os.path.join(old_html_dir, "S.tab")
    u_t_matrix_file_loc = os.path.join(old_html_dir, "U_T.tab")
    beta_matrix_file_loc = os.path.join(old_html_dir, "beta.tab")
    assignments_file_loc = os.path.join(old_html_dir, "assignments0.tab")

    # First open the genes file.
    genes_reader = tsv.TsvReader(open(genes_file_loc, 'r'))

    # This holds an iterator over lines in that file
    genes_iterator = genes_reader.__iter__()

    # Extract data type of the pre-existing visualization & the list of genes
    old_data_type = genes_iterator.next()
    print("Previous Data Type", old_data_type)

    # First see if the new data and the old data are of compatible data types
    new_data_type = options.type
    old_genes_list = []
    # If they are the same data type add the genes to a python list
    if old_data_type[0] == new_data_type:
        print("Same Data Types")
        old_genes_list = genes_iterator.next()
        genes_reader.close()

        # First open the raw data file.
        raw_data_reader = tsv.TsvReader(open(raw_data, 'r'))
        # This holds an iterator over lines in that file
        raw_data_iterator = raw_data_reader.__iter__()

        sample_names = raw_data_iterator.next()
        sample_names = sample_names[1:]
        num_samples = len(sample_names)
        new_genes_list = []
        for row in raw_data_iterator:
            new_gene = row[0]
            new_genes_list.append(new_gene)
        raw_data_reader.close()

        # Get the number of new samples & number of old genes to create
        # a new numpy data matrix
        print("Number of New Samples:", num_samples)
        num_new_genes = len(new_genes_list)
        print("Number of New genes:", num_new_genes)

        # Re-Initialize the data iterator
        # This holds an iterator over lines in that file
        raw_data_reader = tsv.TsvReader(open(raw_data, 'r'))
        raw_data_iterator = raw_data_reader.__iter__()
        # Skip the first line which is simply a row of headers
        raw_data_iterator.next()

        # Next we have to dump all the values from the file into a numpy matrix
        # The values will be unsorted. We will then have to sort the rows of the
        # numpy matrix according to the order prescribed by old_genes_list
        raw_data_matrix_unsorted = numpy.zeros(shape=(num_new_genes,
                                                      num_samples))
        for rindex, row in enumerate(raw_data_iterator):
            # Cut off the first value of each row. It is simply the gene name.
            only_values = row[1:]
            # Place the data from only_values into the appropriate row in
            # raw_data_matrix.
            for cindex, col in enumerate(only_values):
                raw_data_matrix_unsorted[rindex][cindex] = only_values[cindex]

        # For every gene in old_genes_list, search the new_genes_list for the
        # appropriate index. Then use this index to find the values in the
        # unsorted data matrix and copy them into a new sorted matrix.
        # This new matrix will be used to compute the (x,y) coordinates
        # needed to map the new samples.
        num_old_genes = len(old_genes_list)

        #Debugging
        num_no_data = 0

        raw_data_matrix_sorted = numpy.zeros(shape=(num_old_genes,
                                                    num_samples))
        for rindex, gene in enumerate(old_genes_list):
            # Find the index of the desired gene in the new_genes_list
            # This index will correspond to the row in the raw_data_matrix_unsorted
            # that we want to extract and place in the raw_data_matrix_sorted
            try:
                gene_index = new_genes_list.index(gene)
                extracted_data_row = raw_data_matrix_unsorted[gene_index]
                # Iterate over the extracted row to place the values in the appropriate row
                # of the sorted data matrix.
                for cindex, col in enumerate(extracted_data_row):
                    raw_data_matrix_sorted[rindex][
                        cindex] = extracted_data_row[cindex]
            except ValueError:
                num_no_data += 1
        print("Number of genes with no data", num_no_data)

        # Open up S matrix, U^T, and Betas for x,y coordinate computation
        # First open the matrix file.
        s_reader = tsv.TsvReader(open(s_matrix_file_loc, 'r'))
        u_t_reader = tsv.TsvReader(open(u_t_matrix_file_loc, 'r'))
        beta_reader = tsv.TsvReader(open(beta_matrix_file_loc, 'r'))

        # Next create iterators to traverse the files
        s_iterator = s_reader.__iter__()
        u_t_iterator = u_t_reader.__iter__()
        beta_iterator = beta_reader.__iter__()

        # Create an array for s_values & create a diagonal matrix from it
        s_values = s_iterator.next()
        float_s_values = []
        for value in s_values:
            v = float(value)
            float_s_values.append(v)
        s_values = float_s_values

        print("S_values", s_values)
        s_diag = numpy.diag(s_values)
        print(s_diag)

        # Create a numpy matrix for u_t (number of principal components * number of genes)
        u_t = numpy.zeros(shape=(len(s_values), num_old_genes))
        for rindex, row in enumerate(u_t_iterator):
            for cindex, col in enumerate(row):
                u_t[rindex][cindex] = float(row[cindex])

        # Create a numpy matrix for the betas (number of principal components * 2)
        betas = numpy.zeros(shape=(len(s_values), 2))
        for rindex, row in enumerate(beta_iterator):
            for cindex, col in enumerate(row):
                betas[rindex][cindex] = float(row[cindex])
        betas = numpy.transpose(betas)

        # Compute new coordinates
        coords = betas * (numpy.asmatrix(s_diag) * numpy.asmatrix(u_t) *
                          numpy.asmatrix(raw_data_matrix_sorted))
        print("Coordinates")
        print(coords)

        coords = numpy.transpose(coords)
        # Add to existing "assignments.tab" file
        assignments_writer = tsv.TsvWriter(open(assignments_file_loc, 'a'))
        for rindex, sample in enumerate(sample_names):
            print("Cindex", cindex)
            x = str(coords[rindex, 0])
            y = str(coords[rindex, 1])
            print(sample, x, y)
            assignments_writer.line(sample, x, y)

        assignments_writer.close()

    else:
        raise Exception("Pre-existing Visualization employs ", old_data_type,
                        " data. Data to me mapped is of ", new_data_type,
                        ". Data Types must be the same.")

    return True
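
A shape-checking sketch of the projection above, with made-up dimensions (3 principal components, 4 genes, 2 samples); it only illustrates how the matrix sizes line up, not real data.

import numpy

s_diag = numpy.diag([2.0, 1.5, 0.5])   # components x components
u_t = numpy.ones((3, 4))               # components x genes
betas = numpy.ones((2, 3))             # 2 x components (after the transpose above)
data = numpy.ones((4, 2))              # genes x samples, sorted to match u_t's gene order

coords = betas * (numpy.asmatrix(s_diag) * numpy.asmatrix(u_t) * numpy.asmatrix(data))
print(coords.shape)                    # (2, 2): one (x, y) column per sample
coords = numpy.transpose(coords)       # one (x, y) row per sample, as written out above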
Example #14
#!/usr/bin/env python
"""
A simple example of how to read a TSV file using the 'tsv' module
"""

import tsv

reader = tsv.TsvReader(open("data_samples/file.tsv"))
for parts in reader:
    parts = list(parts)
    # Here parts is a list of strings, one per tab-separated column.
    # Make sure you handle not having enough fields, or not being able to
    # parse numbers where you expect them.
    print("Record with fields: {}".format(parts))
Example #15
def downloadAllReads(job, options):
    """
    Download all the reads for the regions.
    
    """

    # Move to the appropriate working directory from wherever Toil dropped us
    os.chdir(options.cwd)

    # Initialize logging
    RealTimeLogger.set_master(options)

    RealTimeLogger.get().info("Starting download")

    # First make the output directory
    if not os.path.exists(options.out_dir):
        try:
            # Make it if it doesn't exist
            os.makedirs(options.out_dir)
        except OSError:
            # If you can't make it, maybe someone else did?
            pass

    # Whatever happens, it needs to exist here
    assert (os.path.exists(options.out_dir) and os.path.isdir(options.out_dir))

    # Holds the chromosome number for each region?
    region_chromosomes = {}
    # Holds the minimum start position for each region on its chromosome
    region_starts = collections.defaultdict(lambda: float("inf"))
    # Holds the maximum stop position for each region on its chromosome
    region_stops = collections.defaultdict(lambda: float("-inf"))

    # Holds the contig:start-end string for each alt in a given region
    # The reference range gets added in last
    ranges_by_region = collections.defaultdict(list)

    # Hard-code some regions that aren't real
    ranges_by_region["BRCA1"] = ["chr17:43044294-43125482"]
    ranges_by_region["BRCA2"] = ["chr13:32314861-32399849"]
    ranges_by_region["CENX"] = ["chrX:58605580-62412542"]

    # Read the reference database
    database = tsv.TsvReader(urllib2.urlopen(options.reference_metadata))

    for parts in database:
        # Parse out all the info for this alt and its parent chromosome
        region_name = parts[7]
        # Grab the chromosome ("1" or "X") that's the parent
        parent_chromosome = parts[5]
        parent_start = int(parts[11])
        parent_stop = int(parts[12])
        alt_contig = parts[3]
        alt_start = int(parts[9])
        alt_stop = int(parts[10])

        # Note the region start, stop, and parent chromosome number
        region_chromosomes[region_name] = parent_chromosome
        region_starts[region_name] = min(region_starts[region_name],
                                         parent_start)
        region_stops[region_name] = max(region_stops[region_name], parent_stop)

        # Turn the alt name into the proper format (GL000251.2 to
        # chr6_GL000251v2_alt)
        name_parts = alt_contig.split(".")
        fixed_alt_contig = "chr{}_{}v{}_alt".format(parent_chromosome,
                                                    name_parts[0],
                                                    name_parts[1])

        # Add it to the list for its region
        ranges_by_region[region_name].append("{}:{}-{}".format(
            fixed_alt_contig, alt_start, alt_stop))

    for region_name in region_chromosomes.iterkeys():
        # Add in the reference ranges that all the alts are alternatives to
        ranges_by_region[region_name].append("chr{}:{}-{}".format(
            region_chromosomes[region_name], region_starts[region_name],
            region_stops[region_name]))

    # Are we using a real FTP URL, or a file URL?

    if urlparse.urlparse(options.sample_ftp_root).scheme == "ftp":
        # It's really FTP
        ftp, root_path = ftp_connect(options.sample_ftp_root)
    else:
        # Assume it's a bare file path
        ftp = FakeFTP(options.sample_ftp_root)
        root_path = ""

    if len(root_path) > 0:
        # Calculate the FTP base URL (without directory). We need it later for
        # turning found index files into URLs.
        base_url = options.sample_ftp_root[:-len(root_path)]
    else:
        # If root_path is empty, we should do nothing, because there's no way to
        # say [:-0].
        base_url = options.sample_ftp_root

    RealTimeLogger.get().info("Sample root: {} Base URL: {}".format(
        options.sample_ftp_root, base_url))

    # Dump the good data files for samples
    good_samples = open("{}/good.txt".format(options.out_dir), "w")

    # Grab all the population names that match the population pattern
    population_names = [
        n for n in ftp.nlst()
        if fnmatch.fnmatchcase(n, options.population_pattern)
    ]

    # TODO: We'll go through them in this order, so if you want a representative
    # subsampling, add some shuffle here or something.

    # This holds URLs to data files (BAM/CRAM) with indexes that are on a
    # sufficient number of contigs, by sample name. We take the first
    # sufficiently good file for any sample.
    sample_file_urls = {}

    for population_name in population_names:

        # For each of those, we need to get samples
        # Go to the population root
        ftp.cwd("{}/{}".format(root_path, population_name))

        # Grab all the sample names that match the sample name pattern.
        # Hopefully there aren't too many.
        sample_names = [
            n for n in ftp.nlst()
            if fnmatch.fnmatchcase(n, options.sample_pattern)
        ]

        # TODO: handle failures during explore_path?
        for sample_name in sample_names:
            # For every sample

            print("Try {}".format(sample_name))

            for data_name in explore_path(
                    ftp, "{}/{}/{}".format(root_path, population_name,
                                           sample_name), options.file_pattern):
                # Find its data files (there may be several)

                # Get the index for each
                index_name = data_name + options.index_suffix

                print(index_name)

                if options.min_indexed_contigs > 0:
                    # We need to run the check on the index before downloading
                    # the sample reads.

                    # Count up the contigs it indexes over
                    indexed_contigs = count_indexed_contigs(
                        "{}/{}".format(base_url, index_name),
                        options.ftp_retry)

                    if indexed_contigs >= options.min_indexed_contigs:
                        # This file for this sample is good enough
                        sample_file_urls[sample_name] = "{}/{}".format(
                            base_url, data_name)

                        RealTimeLogger.get().info(
                            "Sample {} has index of {} contigs".format(
                                sample_name, indexed_contigs))
                        # Add the sample to the file we spit out
                        good_samples.write("{}\n".format(sample_name))

                        # Don't finish exploring the path
                        break

                    else:
                        # Complain
                        RealTimeLogger.get().warning(
                            "Sample {} has index on too few contigs ({})."
                            "Skipping!".format(sample_name, indexed_contigs))

                else:
                    # We don't need to check the number of indexed contigs.
                    RealTimeLogger.get().info(
                        "Sample {} doesn't need an "
                        "indexed contigs check".format(sample_name))
                    # Still use this one. TODO: unify code with above.
                    sample_file_urls[sample_name] = "{}/{}".format(
                        base_url, data_name)
                    # Add the sample to the file we spit out
                    good_samples.write("{}\n".format(sample_name))

            if len(sample_file_urls) >= options.sample_limit:
                # We got enough. Don't finish this population
                break

        if len(sample_file_urls) >= options.sample_limit:
            # We got enough. Don't check more populations
            break

    good_samples.close()

    RealTimeLogger.get().info("Got {} sample URLs".format(
        len(sample_file_urls)))

    if (options.sample_limit < float("inf")):
        # Make sure we got as many as we wanted.
        assert (len(sample_file_urls) == options.sample_limit)

    for region_name in options.regions:
        for sample_name, sample_url in sample_file_urls.iteritems():

            # Make sure the sample directory exists
            sample_dir = "{}/{}/{}".format(options.out_dir, region_name,
                                           sample_name)

            if not os.path.exists(sample_dir):
                try:
                    # Make it if it doesn't exist
                    os.makedirs(sample_dir)
                except OSError:
                    # If you can't make it, maybe someone else did?
                    pass

            assert (os.path.exists(sample_dir) and os.path.isdir(sample_dir))

            # Where will this sample's BAM for this region go?
            bam_filename = "{}/{}.bam".format(sample_dir, sample_name)

            if os.path.exists(bam_filename) and not options.overwrite:
                # Don't re-download stuff we already have.
                RealTimeLogger.get().info("Skipping {} x {} which has already "
                                          "been downloaded".format(
                                              region_name, sample_name))
                continue

            RealTimeLogger.get().info("Making child for {} x {}: {}".format(
                region_name, sample_name, sample_url))

            # Now kick off a job to download all the ranges for the region in
            # parallel for this sample, and then concatenate them together. Tell
            # it to save the results to a file on a shared filesystem.
            job.addChildJobFn(downloadRegion,
                              options,
                              region_name,
                              sample_url,
                              ranges_by_region[region_name],
                              bam_filename,
                              cores=1,
                              memory="1G",
                              disk="4G")

    RealTimeLogger.get().info("Done making children")
Example #16
import tsv
import re

reader = tsv.TsvReader(open('karint_corpus.tsv', encoding='utf-8'))

for i in reader:
    # print(' '.join(i))
    msg = list(i)[1]
    if re.search(r'.sozluk.', msg) is not None:
        print(msg)
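
Note that the dots in r'.sozluk.' match any character, not literal periods, so strings like "xsozlukx" also pass the filter. If a literal substring is intended, escaping it is safer; a minimal illustration:

import re

print(re.search(r'.sozluk.', "xsozlukx") is not None)           # True: '.' matches any char
print(re.search(re.escape("sozluk."), "xsozlukx") is not None)  # False: the dot is now literal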
Example #17

def scan_region(job, options, region, pop_by_sample, sample_whitelist):
    """
    Scan all the graphs in a region for bias.
    
    If sample_whitelist is not None, ignores samples not in that set.
    
    """

    # Set up the IO stores.
    in_store = IOStore.get(options.in_store)
    out_store = IOStore.get(options.out_store)

    # This holds a dict from graph name, then sample name, then stat name to
    # actual stat value.
    stats_cache = collections.defaultdict(
        lambda: collections.defaultdict(dict))

    # This is the cache file for this region, in
    # <graph>\t<sample>\t<stat>\t<value> format
    cache_tsv_key = "plots/cache/{}.tsv".format(region)

    # What name will it have locally for us?
    local_filename = os.path.join(job.fileStore.getLocalTempDir(), "temp.tsv")

    if out_store.exists(cache_tsv_key):
        # Just read in from that TSV

        RealTimeLogger.get().info("Loading cached region {}".format(region))

        # Grab the cached results
        out_store.read_input_file(cache_tsv_key, local_filename)

        # Read all the pop, value pairs from the TSV
        reader = tsv.TsvReader(open(local_filename))

        # Which samples are going to be skipped?
        skipped_samples = set()

        for graph, sample, stat, value in reader:
            # Read every line from the cache and pull out what value for what
            # stat it gives for what sample.

            if sample_whitelist is not None and sample not in sample_whitelist:
                # Skip this sample that's not on the list
                skipped_samples.add(sample)
                continue

            # Populate our cache dict
            stats_cache[graph][sample][stat] = float(value)

        RealTimeLogger.get().info("Skipped {} samples".format(
            len(skipped_samples)))

    else:
        # Stats haven't been collated
        raise RuntimeError(
            "No graph stats for {}; run collateStatistics.py".format(region))

    # We want normalized and un-normalized versions of the stats cache
    stats_by_mode = {"absolute": stats_cache}

    if stats_cache.has_key("refonly"):

        # Deep copy and normalize the stats cache
        normed_stats_cache = copy.deepcopy(stats_cache)

        # We want to normalize, and the reference exists (i.e. this is not CENX)
        # Normalize every stat against the reference, by subtraction
        for graph, stats_by_sample in normed_stats_cache.iteritems():
            # For each graph and all the stats for that graph
            for sample, stats_by_name in stats_by_sample.iteritems():
                # For each sample and all the stats for that sample
                for stat_name in stats_by_name.keys():

                    if stats_cache["refonly"].has_key(sample):

                        # Get the reference value
                        ref_value = stats_cache["refonly"][sample][stat_name]

                        # Normalize by subtraction
                        stats_by_name[stat_name] -= ref_value

                    else:
                        # Nothing to norm against. TODO: maybe complain when
                        # sample sets aren't all the same?
                        stats_by_name[stat_name] = None

        # Register this as a condition
        stats_by_mode["normalized"] = normed_stats_cache

    # Now save stats, parceling out by region and graph

    for mode, mode_stats_cache in stats_by_mode.iteritems():
        for graph, stats_by_sample in mode_stats_cache.iteritems():
            # We need some config
            # Where should we route each stat to?
            stat_file_keys = {
                "substitution_rate":
                "bias/{}/{}/substrate.{}.tsv".format(mode, region, graph),
                "indel_rate":
                "bias/{}/{}/indelrate.{}.tsv".format(mode, region, graph),
                "portion_perfect":
                "bias/{}/{}/perfect.{}.tsv".format(mode, region, graph)
            }

            # Make a local temp file for each (dict from stat name to file
            # object with a .name).
            stats_file_temps = {
                name: tempfile.NamedTemporaryFile(
                    dir=job.fileStore.getLocalTempDir(), delete=False)
                for name in stat_file_keys.iterkeys()
            }

            for sample, stats_by_name in stats_by_sample.iteritems():
                # For each sample and all the stats for that sample
                for stat_name, stat_value in stats_by_name.iteritems():
                    # For each stat

                    if not stats_file_temps.has_key(stat_name):
                        # Skip stats that have nowhere to go
                        continue

                    # Write graph and value to the file for the stat, for
                    # plotting, naming it after the pop that the sample is in
                    stats_file_temps[stat_name].write("{}\t{}\n".format(
                        pop_by_sample[sample], stat_value))

            for stat_name, stat_file in stats_file_temps.iteritems():
                # Flush and close the temp file
                stat_file.flush()
                os.fsync(stat_file.fileno())
                stat_file.close()

                # Upload the file
                out_store.write_output_file(stat_file.name,
                                            stat_file_keys[stat_name])
Example #18

def scan_all(job, options, sample_whitelist):
    """
    Scan all the regions and graphs for bias.
    
    Only looks at samples in the whitelist set, if the whitelist is not None.
    
    """

    # Set up the IO stores.
    in_store = IOStore.get(options.in_store)
    out_store = IOStore.get(options.out_store)

    # Download the superpopulation assignments
    # This holds superpop by pop
    superpopulation_by_population = {}

    for parts in tsv.TsvReader(
            urllib2.urlopen(urllib2.Request(options.superpopulation_url))):
        # For each population code (column 1), assign it to the right
        # superpopulation (column 2).
        superpopulation_by_population[parts[1]] = parts[2]

    RealTimeLogger.get().info("Downloading sample population assignments")

    # Load the 1000 Genomes population assignments.
    # Make a reader that goes through split out lines in the TSV.
    reader = tsv_reader_with_comments(
        urllib2.urlopen(urllib2.Request(options.index_url)))

    # Get an iterator over the lines
    lines = iter(reader)

    # Grab the headings
    headings = lines.next()

    while headings[0].startswith("##"):
        # Skip leading lines that aren't the real header (which starts with #)
        headings = lines.next()

    # Which column holds sample names?
    sample_name_column = headings.index("SAMPLE_NAME")

    # Which column holds sample populations?
    sample_population_column = headings.index("POPULATION")

    # What dict do we fill in? Holds population string by sample name.
    # We now use the superpopulation names for our populations.
    pop_by_sample = {}

    # We also want to count samples in each population for debugging
    samples_per_pop = collections.Counter()

    for parts in lines:
        # Save superpopulation under sample
        pop_by_sample[parts[sample_name_column]] = \
            superpopulation_by_population[parts[sample_population_column]]

        # Count the sample for its population
        samples_per_pop[parts[sample_population_column]] += 1

    RealTimeLogger.get().info("Found {} populations:".format(
        len(samples_per_pop)))

    for (pop, count) in samples_per_pop.iteritems():
        RealTimeLogger.get().info("{}: {}".format(pop, count))

    for region in in_store.list_input_directory("stats"):
        # Collate everything in the region
        job.addChildJobFn(scan_region,
                          options,
                          region,
                          pop_by_sample,
                          sample_whitelist,
                          cores=1,
                          memory="1G",
                          disk="10G")
Example #19
import math
import tsv
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

root_path = "../data/poly_mo/"

# preprocess data files

# quant_bootstraps.tsv: the matrix of bootstrap experiments, holding the final
# count for each transcript in each round of bootstrapping, with one row per
# bootstrap run and one column per transcript.
quant_bootstraps = tsv.TsvReader(open(root_path + "quant_bootstraps.tsv"))
count = 0
quant_boot = []
for parts in quant_bootstraps:
    quant_boot.append(parts)
df_quant_boot = pd.DataFrame.from_records(quant_boot[1:],
                                          columns=quant_boot[0])
df_quant_boot = df_quant_boot.astype('float')
df_quant_boot_mean = df_quant_boot.mean()
df_quant_boot_std = df_quant_boot.std()
id_in_quant_boot = list(df_quant_boot.columns)

# given serial distance: the original snippet references a mapping called
# "distance" (transcript id -> distance) that is defined elsewhere and not
# shown here; an empty dict stands in for it so the excerpt still runs.
distance = {}

for tid in id_in_quant_boot:
    if tid in distance:
        for i in range(len(df_quant_boot[tid])):
            pass  # loop body truncated in the original snippet
Example #20
def doTheThing(inputStream, regionsToCheck=["Jita", "Hek", "Rens"]):
    initializeItems()

    inventory = {}
    invReader = tsv.TsvReader(inputStream)
    for rawRow in invReader:
        row = list(rawRow)
        item = nameToItem[row[0]]
        quantity = int(row[1].replace(",", ""))
        if item.typeId not in inventory:
            inventory[item.typeId] = {"quantity": quantity, "item": item}
        else:
            inventory[item.typeId]["quantity"] += quantity

    reprocessOutputsToConsider = set()

    for invItem in inventory.values():
        for materialTypeId in invItem["item"].reprocessingOutputs.keys():
            reprocessOutputsToConsider.add(materialTypeId)

    # Prefetch everything
    requestMap = {}
    for materialTypeId in reprocessOutputsToConsider:
        jitaRegion = regionMap["Jita"]["regionId"]
        try:
            requestMap[jitaRegion].add(materialTypeId)
        except KeyError:
            requestMap[jitaRegion] = {materialTypeId}

    # for regionId in map(lambda region: region["regionId"], regionMap.values()):
    for regionId in map(lambda region: regionMap[region]["regionId"],
                        regionsToCheck):
        for itemId in inventory.keys():
            try:
                requestMap[regionId].add(itemId)
            except KeyError:
                requestMap[regionId] = {itemId}

    start = time.time()
    orderHistory = runBatch(requestMap)
    print("Offer retrieval time: {}".format(time.time() - start))

    offers = {}
    jitaOffers = {}
    for region in regionsToCheck:
        regionId = regionMap[region]["regionId"]

        for typeId in inventory.keys():
            item = typeIdToItem[typeId]
            response = orderHistory[regionId][typeId]
            try:
                fiveDayAverage = getFiveDayAverage(response)
            except statistics.StatisticsError:
                eprint("Failed to process '{typeId}' in {region}. Continuing.".
                       format_map({
                           "typeId": item.name,
                           "region": region
                       }))
                continue

            if typeId not in offers:
                offers[typeId] = {}

            offers[typeId][region] = {
                "item": item,
                "price": fiveDayAverage,
            }

            if "Jita" == region:
                jitaOffers[typeId] = fiveDayAverage

    reprocessPrices = {}

    for typeId in reprocessOutputsToConsider:
        response = orderHistory[regionMap["Jita"]["regionId"]][typeId]
        fiveDayAverage = getFiveDayAverage(response)
        reprocessPrices[typeId] = fiveDayAverage

    reprocessOffers = {}
    for typeId in inventory.keys():
        reprocessValue = 0
        item = typeIdToItem[typeId]
        if 0 == len(item.reprocessingOutputs):
            continue
        for materialTypeId in item.reprocessingOutputs.keys():
            try:
                unitPrice = reprocessPrices[materialTypeId]
            except KeyError:
                continue

            reprocessValue += ((1 - REPROCESSING_TAX_RATE) *
                               REPROCESSING_EFFICIENCY * unitPrice *
                               item.reprocessingOutputs[materialTypeId])

        reprocessOffers[typeId] = reprocessValue

    bestOffers = {k: [] for k in regionsToCheck}
    for typeId in offers:
        bestRegion = ""
        bestPrice = -1.00

        for region in offers[typeId]:
            currentPrice = offers[typeId][region]["price"]
            if currentPrice > bestPrice:
                bestRegion = region
                bestPrice = currentPrice

        bestOffers[bestRegion].append(offers[typeId][bestRegion])

    results = {}
    for region in bestOffers:
        regionOffers = bestOffers[region]
        if 0 == len(regionOffers):
            continue

        doJitaComparison = "Jita" != region

        headers = ["Item Name", "Unit Price"]
        columnFormatString = ["{}", "{:,.2f}"]
        if doJitaComparison:
            headers.append("Jita")
            columnFormatString.append("{:,.2f} ISK")

        headers.append("Reprocess Value")
        columnFormatString.append("{:,.2f} ISK")
        headers.append("Qty")
        columnFormatString.append("{}")
        headers.append("Estimated Price")
        columnFormatString.append("{:,.2f} ISK")
        tableData = []
        for offer in regionOffers:
            item = offer["item"]
            typeId = item.typeId
            qty = inventory[typeId]["quantity"]
            unitPrice = offer["price"]
            row = [item.name, unitPrice]
            if doJitaComparison:
                row.append(jitaOffers[typeId])
            try:
                row.append(reprocessOffers[typeId])
            except KeyError:
                row.append(None)
            row.append(qty)
            row.append(qty * unitPrice)
            tableData.append(row)

        tableData.sort(key=lambda row: row[-1], reverse=True)

        results[region] = {
            'headers': headers,
            'data': tableData,
            'format': columnFormatString
        }
    return results