Example #1
def main():
    myFile = open('example.tsv', 'w')
    with myFile:
        writer = tsv.TsvWriter(myFile)
        writer.line("first_name", "second_name", "Grade")
        writer.line('Alex', 'Brian', 'A')
        writer.line('Tom', 'Smith', 'B')
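The same module also provides tsv.TsvReader (used in Example 18 below) for reading such a file back. A minimal sketch, assuming example.tsv was written as above:

import tsv

def read_grades():
    # TsvReader yields each row as a list of string fields.
    with open('example.tsv') as my_file:
        for fields in tsv.TsvReader(my_file):
            print(fields)  # e.g. ['first_name', 'second_name', 'Grade']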
Example #2
def main(args):
    """
    Parses command line arguments and does the work of the program.
    "args" specifies the program arguments, with args[0] being the executable
    name. The return value should be used as the program's exit code.
    """
    
    options = parse_args(args) # This holds the nicely-parsed options object
    
    writer = tsv.TsvWriter(options.out_file)
    
    # We use this to keep our state: pairs of SeqRecord and current offset
    batch_state = []
    
    # We use this to keep track of the matches found for each query sequence ID
    matches = collections.defaultdict(set)
    
    # OK so we need to run that update function a bunch and feed in data
    
    for record in Bio.SeqIO.parse(options.query_name, "fasta"):
        # For each record to place
        
        while len(batch_state) >= options.batch_size:
            # Run through existing batches until there's room
            batch_state = run_batch(options, batch_state, matches, writer)
            
        # Start a new fake thread on this record
        batch_state.append((record, 0))
        
    # Now run all the records to completion
    while len(batch_state) > 0:
        batch_state = run_batch(options, batch_state, matches, writer)
        
    return 0
Example #3
def saveBasesDropped(job, options, return_value_dict, out_key):
    """
    Given a dict from region name, graph name, and sample name to the return
    value tuple of convertGlennToVcf, which is (vcf_file_id, bases_dropped),
    save a TSV in the format <region>\t<graph>\t<sample>\t<bases dropped> to the
    given key.
    
    """

    # We need to save the TSV file somewhere
    local_filename = os.path.join(job.fileStore.getLocalTempDir(), "stats.tsv")

    writer = tsv.TsvWriter(open(local_filename, "w"))

    # Make the IOStores
    graph_store = IOStore.get(options.graph_store)
    call_store = IOStore.get(options.call_store)
    out_store = IOStore.get(options.out_store)

    for region_name, by_graph in return_value_dict.iteritems():
        # For each region and the graphs in it
        for graph_name, by_sample in by_graph.iteritems():
            # For each graph and the samples on it
            for sample_name, stats in by_sample.iteritems():
                # For each sample and its return value from convertGlennToVcf

                # Save the bases dropped
                writer.line(region_name, graph_name, sample_name, stats[1])

    # Save the aggregated output
    writer.close()
    out_store.write_output_file(local_filename, out_key)
Example #4
def main():
    #Creates a file called example.tsv, "w" means you can write to the file
    myFile = open('example.tsv', 'w')
    with myFile:
        #Writes to the file and includes whatever data you put in here
        writer = tsv.TsvWriter(myFile)
        writer.line("first_name", "second_name", "Grade")
        writer.line('Alex', 'Brian', 'A')
        writer.line('Tom', 'Smith', 'B')
Example #5
def generateMetaTSVFromData(inputPath, outputPath):

    with open(inputPath) as f:
        data = json.load(f)
    writer = tsv.TsvWriter(open(outputPath, "w"))
    print("Generate meta file for", len(data), " entries")
    # writer.comment("id character class")
    writer.line("character\tclass")
    for id, idx in enumerate(data):
        #id #char #class
        item = data[idx]
        writer.line(item[1] + "\t" + item[2])
    writer.close()
Example #6
def generate_qa_sheets(file_path):
    writer = tsv.TsvWriter(open(file_path, 'w'))
    # title
    writer.line('Question', 'Answer', 'Source', 'Metadata')
    with open('../../data/environment.json', 'r') as f:
        environ = json.load(f)
    url_list = environ['url_list']
    for url in url_list:
        results = get_information(url)
        for res in results:
            # if not res['q'] or not res['a'] or not res['s'] or not res['m']:
            #     continue
            writer.line(res['q'], res['a'], res['s'], res['m'])
    writer.close()
Example #7
def export_shop_info():
    writer = tsv.TsvWriter(open("shop_info.tsv", "w"))
    # Add a comment line
    writer.comment("站点名称, 热卖款式:url(链接), product_type(产品类型), address(站点)")
    writer.line("url", "product_type", "address")
    client = MongoClient('127.0.0.1', 27017)
    db = client.seventeen_zwd
    db_name = "station_message"
    for i in list(db[db_name].find()):
        data = []
        data.append(i["url"])
        data.append(i["product_type"])
        data.append(i["address"])
        writer.list_line(data)
    writer.close()
Example #8
def export_product_info():
    writer = tsv.TsvWriter(open("shop_type_message.tsv", "w"))
    # Add a comment line
    writer.comment("市场,热卖档口,档口种类信息:url(链接), type(种类), market(市场), address(站点)")
    writer.line("url", "type", "market", "address")
    client = MongoClient('127.0.0.1', 27017)
    # Connect to the database we need; sf_fy (SF Express "FengYan") is the database name
    db = client.seventeen_zwd
    db_name = "shop_type_message"
    for i in list(db[db_name].find()):
        print(json.dumps(i["type"]))
        data = []
        data.append(i["url"])
        data.append(json.dumps(i["type"], ensure_ascii=False))
        data.append(json.dumps(i["market"], ensure_ascii=False))
        data.append(i["address"])
        writer.list_line(data)
    writer.close()
Example #9
#!/usr/bin/env python

"""
This example shows that you can write bad data into TSV files
using the 'tsv' module
"""

import tsv

writer = tsv.TsvWriter(open("/tmp/file.tsv", "w"))

writer.line("\t\t\t", "Column 2", 12345)
writer.close()
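TsvWriter.line does not escape embedded tabs, so the file above ends up with extra, empty columns. A hedged sketch of a small wrapper that strips tabs and newlines before writing; clean_line is a hypothetical helper, not part of the tsv module:

import tsv

def clean_line(writer, *fields):
    # Hypothetical helper: replace characters that would break the TSV structure.
    safe = [str(field).replace("\t", " ").replace("\n", " ") for field in fields]
    writer.list_line(safe)

writer = tsv.TsvWriter(open("/tmp/clean.tsv", "w"))
clean_line(writer, "\t\t\t", "Column 2", 12345)  # stays one three-column row
writer.close()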
Example #10
def run(options):
    """
    Do the actual work of the program.
    """
    
    # Set up our BED output
    writer = tsv.TsvWriter(options.output_bed)
    
    # Read the PSL
    for result in Bio.SearchIO.parse(options.input_psl, "blat-psl"):
        for hit in result:
            for hsp in hit:
                # A hit is the equivalent of a PSL line; we're going to make it a BED record
                
                # Pull out the block sizes and starts
                block_sizes = list(hsp.hit_span_all)
                block_starts = list(hsp.query_start_all)
                
                # We need to find out the strand
                strand = "+"
                for fragment in hsp:
                    # Loop over all the fragments. We know they'll all be on the
                    # same strand, because that's how PSL can articulate them.
                    if fragment.query_strand == -1:
                        # We ought to be on the - strand. This means our blocks
                        # are going to be in backwards order.
                        strand = "-"
                        block_sizes.reverse()
                        block_starts.reverse()
                        break
                        
                # Convert starts to be relative to aligned region
                block_starts = [x - hsp.query_start for x in block_starts]
                
                # The first should always be 0
                assert(block_starts[0] == 0)
                
                # BED is: chrom, chromStart, chromEnd, name, score, strand,
                # thickStart, thickEnd, itemRgb, blockCount, blockSizes,
                # blockStarts
                bed_record = [
                    # chrom
                    options.contig or result.id,
                    # chromStart
                    hsp.query_start + options.offset,
                    # chromEnd
                    hsp.query_end + options.offset,
                    # name
                    hit.id,
                    # score
                    0,
                    # strand
                    strand,
                    # thickStart
                    hsp.query_start + options.offset,
                    # thickEnd
                    hsp.query_start + options.offset,
                    # itemRgb
                    "0,0,0", 
                    # blockCount
                    len(hsp),
                    # blockSizes
                    ",".join((str(x) for x in block_sizes)),
                    # blockStarts
                    ",".join((str(x) for x in block_starts))
                ]
                
                writer.list_line(bed_record)
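A hedged usage sketch for run(): it only reads options.input_psl, options.output_bed, options.contig, and options.offset, so a plain namespace is enough. The values below are illustrative assumptions, not part of the original script:

import argparse
import sys

options = argparse.Namespace(
    input_psl="alignments.psl",  # hypothetical PSL path
    output_bed=sys.stdout,       # the TsvWriter writes BED lines here
    contig=None,                 # fall back to the PSL query name for chrom
    offset=0,                    # no coordinate shift
)
run(options)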
Example #11
def main():

    # Where do we find receipts to import?
    INBOX_PATH = 'Inbox'
    # Where do we put them?
    DATABASE_PATH = 'Database'

    if not os.path.exists(INBOX_PATH):
        os.makedirs(INBOX_PATH)

    if not os.path.exists(DATABASE_PATH):
        os.makedirs(DATABASE_PATH)

    # All imported receipts will be assigned to today, no matter when they were made
    date_string = time.strftime('%Y/%m/%d')

    # Determine where to put today's receipts
    dest_dir = os.path.join(DATABASE_PATH, date_string)

    # Find all the inbox files
    inbox_files = list(os.listdir(INBOX_PATH))

    # Find all the TXTs
    txt_files = [f for f in inbox_files if f.lower().endswith('.txt')]
    # And the PDFs
    pdf_files = [f for f in inbox_files if f.lower().endswith('.pdf')]

    # TODO: Check for duplicates in different cases of the same extension

    # File them by basename
    txt_by_basename = {f[:-4]: f for f in txt_files}
    pdf_by_basename = {f[:-4]: f for f in pdf_files}

    # Count imported receipts
    import_count = 0

    for basename in txt_by_basename.keys():
        if basename not in pdf_by_basename:
            print("Warning: {} exists but PDF is missing. Skipping!".format(
                txt_by_basename[basename]))
            continue

        # Find the pair of files
        txt_filename = os.path.join(INBOX_PATH, txt_by_basename[basename])
        pdf_filename = os.path.join(INBOX_PATH, pdf_by_basename[basename])

        if not os.path.exists(dest_dir):
            # Make sure the destination directory exists
            os.makedirs(dest_dir)

        # Get a new unique receipt ID
        # This is super N^2, but you should not have large N receipts for one day.
        receipt_id = get_new_id(dest_dir)

        # Where do we put the raw OCR text
        dest_txt_filename = os.path.join(dest_dir, '{}.txt'.format(receipt_id))
        # And the processed items
        dest_tsv_filename = os.path.join(dest_dir, '{}.tsv'.format(receipt_id))
        # And the PDF
        dest_pdf_filename = os.path.join(dest_dir, '{}.pdf'.format(receipt_id))

        # Count the items
        item_count = 0
        # And the price
        total_price = 0.0

        # Create the TSV
        with open(txt_filename, 'r') as text_in:
            with open(dest_tsv_filename, 'w') as tsv_out:
                # Prepare a TSV writer
                writer = tsv.TsvWriter(tsv_out)

                for item, price in parse_items(text_in):
                    # Save each item and its price
                    writer.line(item, str(price))

                    item_count += 1
                    if not math.isnan(price):
                        total_price += price

        # Move the other files
        move(txt_filename, dest_txt_filename)
        move(pdf_filename, dest_pdf_filename)

        print('Imported {} items with total price {} as {}'.format(
            item_count, total_price, receipt_id))

        import_count += 1

    print("Imported {} receipts".format(import_count))
Example #12
def main(arguments):

    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument('original_input',
                        help="Input file",
                        type=argparse.FileType('r'))
    parser.add_argument('--seq2sick', default=False, action="store_true")
    parser.add_argument('--wordadver', default=False, action="store_true")
    parser.add_argument('--extra_input',
                        help="Input file",
                        type=argparse.FileType('r'))
    parser.add_argument('--output_distance_type',
                        type=str,
                        default="infersent-cosine")
    parser.add_argument('--input_distance_type',
                        type=str,
                        default="infersent-cosine")
    parser.add_argument('--save', default=False, action="store_true")
    parser.add_argument('--plot', default=False, action="store_true")
    parser.add_argument('--get_bootstrap', default=False, action="store_true")
    parser.add_argument('--get_samples', default=False, action="store_true")

    args = parser.parse_args(arguments)

    print(args)
    if args.seq2sick:
        lines = args.original_input.readlines()
        seq2sick_lines = args.extra_input.readlines()
        real_ins = []
        real_outs = []
        adv_ins = []
        adv_outs = []
        for line, seq2sickline in zip(lines, seq2sick_lines):
            adversarial_input, adversarial_output, normal_output = seq2sickline.split(
                "\t")
            adversarial_input = adversarial_input.strip()
            adversarial_output = adversarial_output.strip()
            normal_output = normal_output.strip()
            line = line.strip()
            adv_ins.append(adversarial_input)
            adv_outs.append(adversarial_output)
            real_outs.append(normal_output)
            real_ins.append(line)
    elif args.wordadver:
        lines = args.original_input.readlines()

        reals = lines[0:][::2]
        adversarials = lines[1:][::2]

        real_ins, real_outs = process_lines(reals)
        adv_ins, adv_outs = process_lines(adversarials)
    else:
        raise NotImplementedError

    if args.get_bootstrap:
        gain_boostrap_mean, gain_boostrap_std = calculate_real_gain(
            real_ins, real_outs, 1000, args.input_distance_type,
            args.output_distance_type, model)
        print("Real bootstrap region: {} +/- {}".format(
            gain_boostrap_mean, gain_boostrap_std))

        adversarial_boostrap_mean, adversarial_boostrap_std = calculate_adversarial_gain(
            real_ins, real_outs, adv_ins, adv_outs, args.input_distance_type,
            args.output_distance_type, model)

        print("Adversarial Bootstrap region: {} +/ {}".format(
            adversarial_boostrap_mean, adversarial_boostrap_std))
    # TODO: generate graph from some real samples

    if args.plot:
        input_distances, output_distances, gains = calculate_adversarial_gain_details(
            real_ins, real_outs, adv_ins, adv_outs, args.input_distance_type,
            args.output_distance_type, model)

        gains = gains
        gains = normalize_array(np.clip(np.array(gains), -100, 100))
        sizes = [np.pi * (4.0 + x * 10)**2 for x in gains]
        # cmap = plt.cm.rainbow

        cmap = plt.get_cmap('inferno')
        norm = matplotlib.colors.Normalize()
        colors = cmap(norm(gains))
        fig = plt.figure(figsize=(16, 8))
        ax = fig.add_subplot(1, 1, 1)
        plt.xlim(xmax=.08)
        plt.ylim(ymax=1.01, ymin=-.01)
        axis_font = {'fontname': 'Arial', 'size': '32'}
        for label in (ax.get_xticklabels() + ax.get_yticklabels()):
            label.set_fontname(axis_font['fontname'])
            label.set_fontsize(axis_font['size'])
        plt.title("Seq2Sick Adversary on GigaWord", **axis_font)
        plt.xlabel("Input Distance" + "\n" + "(InferSent-Cosine)", **axis_font)
        plt.ylabel(r"Output Distance" + "\n" + "(InferSent-Cosine)",
                   **axis_font)
        plt.scatter(input_distances,
                    output_distances,
                    s=sizes,
                    c=colors,
                    alpha=0.65,
                    edgecolors='none')
        if args.save:
            fig.savefig('seq2sick_adversary.pdf',
                        dpi=fig.dpi,
                        bbox_inches='tight')
        else:
            plt.show()

    if args.get_samples:
        input_distances, output_distances, gains = calculate_adversarial_gain_details(
            real_ins, real_outs, adv_ins, adv_outs, args.input_distance_type,
            args.output_distance_type, model)
        import tsv
        most_gain_samples = np.array(gains).argsort()[:][::-1]
        writer = tsv.TsvWriter(open("samples.tsv", "w"))
        writer.list_line([
            "input", "output", "adv_input", "adv_output", "d_in", "d_out",
            "gain"
        ])
        for z in most_gain_samples:
            col = (real_ins[z], real_outs[z], adv_ins[z], adv_outs[z],
                   input_distances[z], output_distances[z], gains[z])
            writer.list_line(col)

        writer.close()
Example #13
def main(args):
    """
    Parses command line arguments and does the work of the program.
    "args" specifies the program arguments, with args[0] being the executable
    name. The return value should be used as the program's exit code.
    """

    if len(args) == 2 and args[1] == "--test":
        # Run the tests
        return doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE)

    options = parse_args(args)  # This holds the nicely-parsed options object

    # Set Entrez e-mail
    Entrez.email = options.email

    # Go get the region of the reference we're talking about. Starts and ends
    # are 1-based.
    ref_acc, ref_start, ref_end = get_region_info(options.region,
                                                  options.assembly_url)

    # Make our output directory
    if not os.path.exists(options.region):
        os.makedirs(options.region)

    # We're going to write a chrom.sizes file with accessions (not the GI
    # numbers) for the gff3->psl conversion step
    acc_chrom_sizes = tsv.TsvWriter(
        open(options.region + "/acc.chrom.sizes", "w"))

    # Get the reference's GI
    ref_gi = get_gi_number(ref_acc)

    print("Reference for {} is GI{}:{}-{} 1-based".format(
        options.region, ref_gi, ref_start, ref_end))

    # Grab the reference sequence
    ref_seq = get_sequence(ref_gi, ref_start, ref_end)

    print("Got {}bp for a {}bp reference".format(len(ref_seq),
                                                 ref_end - ref_start + 1))

    if len(ref_seq) > ref_end - ref_start + 1:
        # Clip it down if it's too long. Assuming we have the correct sort of
        # coordinates, and that we got served the data starting at the correct
        # offset.
        ref_seq = ref_seq[0:ref_end - ref_start + 1]
    elif len(ref_seq) < ref_end - ref_start:
        raise RuntimeError("Didn't get enough sequence from the API!")

    # Change it to be just called "ref"
    ref_seq.id = "ref"

    # Write it to <region>/ref.fa
    SeqIO.write([ref_seq], open("{}/ref.fa".format(options.region), "w"),
                "fasta")

    # Write a chromosome size entry for the reference by its accession
    acc_chrom_sizes.line(ref_acc, get_length(ref_gi))

    print("Writing genes for ref")

    # Make a BED to put reference genes in
    ref_bed = open_gene_bed(options.region, "ref")

    for line in get_genes(ref_acc, "ref", ref_start, ref_end):
        # Write all the BED lines for the appropriate region of the reference to
        # that file.
        ref_bed.write(line)

    ref_bed.close()

    for alt_acc, alt_unit in get_region_sequences(options.region,
                                                  options.assembly_url):
        # For every alt in the region

        # Get its GI number
        alt_gi = get_gi_number(alt_acc)

        print("Downloading alt GI{}".format(alt_gi))

        # Grab the sequence data
        alt_seq = get_sequence(alt_gi)

        # Write it to <region>/GI<number>.fa
        SeqIO.write([alt_seq],
                    open("{}/GI{}.fa".format(options.region, alt_gi), "w"),
                    "fasta")

        # Add this alt to the chromosome-sizes-by-accession file
        acc_chrom_sizes.line(alt_acc, get_length(alt_gi))
        # Sneak into the TSV writer and flush, so the sizes file can now be
        # read.
        acc_chrom_sizes.stream.flush()

        # Where should we put the GFF alignment for this alt to the reference?
        alt_gff3 = "{}/GI{}.gff3".format(options.region, alt_gi)

        print("Downloading alignment")

        # Go download it
        download_gff3(ref_acc, alt_acc, alt_unit, options.assembly_url,
                      alt_gff3)

        # And we need to convert that to PSL
        alt_psl = "{}/GI{}.psl".format(options.region, alt_gi)

        print("Converting to PSL")

        # Run the conversion with the bit of the sizes file we have so far. We
        # need to pass the chrom.sizes file twice now because gff3ToPsl has
        # changed its interface.
        subprocess.check_call([
            "gff3ToPsl", options.region + "/acc.chrom.sizes",
            options.region + "/acc.chrom.sizes", alt_gff3, alt_psl
        ])

        # Edit the output to point to the GI instead of the accession
        subprocess.check_call(
            ["sed", "-i", "s/{}/GI{}/g".format(alt_acc, alt_gi), alt_psl])

        print("Writing genes for GI{}".format(alt_gi))

        # Make a BED to put alt genes in
        alt_bed = open_gene_bed(options.region, "GI{}".format(alt_gi))

        for line in get_genes(alt_acc,
                              "GI{}".format(alt_gi),
                              alt_parent_grc_id=ref_acc):
            # Write all the BED lines for the alt to the file
            alt_bed.write(line)

        alt_bed.close()

    # Now we need to do psl2maf, complete with globbing.

    print("Creating GRC MAF")

    # Find the psl2maf.py script
    psl2maf = (os.path.dirname(os.path.realpath(__file__)) + "/psl2maf.py")

    # Go call psl2maf, moving the reference stuff over to "ref" and shifting it
    # back so that the first base we clipped out of the reference is 0,
    # splitting apart mismatches, and making sure to use all the PSLs and MAFs
    # in our output directory. We make sure to add 1 to the reference start in
    # the offset, because some basedness-conversion needs to happen. TODO: Make
    # this a function or make this use an import or somehow de-uglify it.
    args = ([
        psl2maf, "--maf", options.region + "/GRCAlignment.maf",
        "--referenceOffset",
        str(-ref_start + 1), "--referenceSequence", "ref", "--noMismatch",
        "--psls"
    ] + glob.glob(options.region + "/*.psl") + ["--fastas"] +
            glob.glob(options.region + "/*.fa"))

    print("Calling: {}".format(" ".join(args)))

    subprocess.check_call(args)
Example #14
def mutual_information_statistics(layers, layer_names, ctx, options):
    """    
    For every binary or continuous layer and for every layout, we need a
    mi_<layer number>_<layout number>.tab file, with scores associating that
    layer with other layers of its type, in <other layer name>\t<score>
    format. This uses mutual information (really, normalized redundancy)
    instead of the statistical tests above to produce a score between 0 and 1,
    with higher being more mutual information.
    """
    # We're going to need a mapping from layer name to layer index.
    layer_indices = {name: i for i, name in enumerate(layer_names)}
    
    for layout_index in ctx.all_hexagons.iterkeys():
        # We look at all layouts for this.
        # We assume layout doesn't somehow change layer types.
        
        # Get windows in this layout. Doesn't really matter what order they're
        # in, since we only compare within layouts. Keep the threshold used
        # above.
        curated_windows = window_tool(
            options.mi_window_size,
            options.mi_window_size,
            ctx,
            threshold=options.mi_window_threshold,
            layout_index=layout_index
        )

        # This will hold per-window discrete values for each layer for which we
        # are computing mutual information statistics. For binary layers, it is
        # the sum of the number of ones in the window. For non-binary layers, it
        # is the histogram bin number in which the window's average value falls,
        # or a past-the-last-bin number for an empty window with a NaN average.
        # This is indexed by layer name, referencing the window values for that
        # layer.
        layer_window_values = {}
       
        for layer_name in ctx.binary_layers:
        
            # For binary layers, get the sum for each window. But use 0 for
            # hexes that don't have values in a certain layer. Also
            # (re)discretize by binning as for continuous below.

            # This holds sums for each layer, for each window.
            window_sums = [
                sum (
                    (
                        layers[layer_name][hexagon]
                        if layers[layer_name].has_key(hexagon) else 0
                    )
                    for hexagon in window
                )
                for window in curated_windows
            ]
            """
            window_sums = [sum((layers[layer_name][hexagon]
                if layers[layer_name].has_key(hexagon) else 0)
                for hexagon in window)
                for window in curated_windows]
            """

            #print 'window_sums:', len(window_sums)
            #pprint.pprint(window_sums)

            if options.mi_binary_binning:
                # We want to bin counts
                
                # Now we have our list of the sum values for each window.
                # Histogram bin the non-NaN values. See
                # <https://gist.github.com/elsonidoq/4230222>
                _, bins = numpy.histogram(
                    [
                        total for total in window_sums
                        if not math.isnan(total)
                    ]
                )
                #_, bins = numpy.histogram([total for total in window_sums
                #    if not math.isnan(total)])

                # Work out the bin numbers for all the totals (NaN windows get the
                # past-the-end bin)
                layer_window_values[layer_name] = numpy.digitize(window_sums,
                    bins)
            else:
                # Don't bin counts.
                layer_window_values[layer_name] = window_sums
        """
        TODO skip continuous for now
        for layer_name in ctx.continuous_layers:
            
            # For continuous layers, get the average for each window, but
            # discretize using histogram bin number.
            
            # This holds averages for each window.
            window_averages = []          
                        
            for window in curated_windows:
                # Compute the sum of the layer in the window
                window_sum = 0
                # And the number of hexes with values involved
                window_values = 0
                
                for hexagon in window:
                    if layers[layer_name].has_key(hexagon):
                        # Sum up over all the hexagons in this window with
                        # values for this layer
                        window_sum += layers[layer_name][hexagon]
                        window_values += 1
                
                if window_values == 0:
                    # Can't take the average Use NaN
                    window_averages.append(float("NaN"))
                else:
                    # Use the average like we're supposed to
                    # TODO: do we need float() here?
                    window_averages.append(float(window_sum) / window_values)
                    
            # Now we have our list of the average values for each window.
            # Histogram bin the non-NaN values. See
            # <https://gist.github.com/elsonidoq/4230222>
            _, bins = numpy.histogram([average for average in window_averages 
                if not math.isnan(average)])
                
            # Work out the bin numbers for all the averages (NaN windows get the
            # past-the-end bin)
            layer_window_values[layer_name] = numpy.digitize(window_averages,
                bins)
        """
        pairs_to_run = len(layer_window_values) ** 2 - len(layer_window_values)  # without compare to self
        print timestamp(), "{} pairs to run".format(pairs_to_run)

        # What layer are we writing the file for?
        current_first_layer = None
        # Where are we writing it to?
        information_writer = None
        
        # How many pairs have we done?
        pair = 0

        #print('layer_window_values')
        #pprint.pprint(layer_window_values);

        message_count = 1
        
        for (layer_a, layer_b, redundancy) in mutualInfo.all_pairs (
            layer_window_values):
            
            # Go get mutual information for each pair of layers, grouped by the
            # first layer.
            
            if layer_a != current_first_layer:
                # We're changing first layers.
                
                if information_writer is not None:
                    # Close the previous file.
                    information_writer.close()
                    
                # Open a tsv writer for the new first layer's redundancies with
                # everyone else.
                information_writer = tsv.TsvWriter(open(os.path.join(
                    options.directory, "mi_{}_{}.tab".format(layout_index, 
                    layer_indices[layer_a])), "w"))
                    
                # Record that we're on that layer as the first layer now.
                current_first_layer = layer_a
                
                
            # Make a line for redundancy with this other layer.
            information_writer.line(layer_b, str(redundancy))
            
            pair += 1

            # Log a progress message for every ~1/30th of pairs processed
            if pair > pairs_to_run * message_count / 30:
                print timestamp(), str(message_count) + '/30 of', pairs_to_run, 'pairs'
                sys.stdout.flush()
                message_count += 1

        print timestamp(), "{} pairs processed".format(pair)
            
        if information_writer is not None:
            # Close the last file.
            information_writer.close()
Example #15
import tsv
import xlsxwriter
from textblob import TextBlob

print('\n\n\t\tScraping Data to text file...\n\n')
data = open(
    'E:/Freelancing Project/abusive/Abusive_Language_Detector/Abusive_Language_Detector/data/text.txt',
    'r',
    encoding='utf8').read()
data = data.replace('ред', '.')
print('Scraped Data: ', data)

blob = TextBlob(data)

workbook = xlsxwriter.Workbook('../output/output.xlsx')
worksheet = workbook.add_worksheet()

writer = tsv.TsvWriter(open("../data/test.tsv", "w"))

writer.line("test_id", "comment")
worksheet.write(0, 0, 'Content ID')
worksheet.write(0, 1, 'Bangla Text')
worksheet.write(0, 2, 'English Text')
worksheet.write(0, 3, 'Prediction')

print('\n\n\t\tSplitting Data to .tsv file...\n\n')

i = 1
print('For each sentence in paragraph:\n\n')
for sentence in blob.sentences:
    en = str(sentence.translate(to='en'))
    print(i, '. ', sentence, ' - ', en)
    worksheet.write(i, 0, i)
Example #16
def main(args):
    ##### STEP 1: CNN architecture #####
    cnn = CNN()
    print(cnn)
    if is_cuda:
        cnn.cuda()

    optimizer = torch.optim.Adam(cnn.parameters(),
                                 lr=LR)  # optimize all cnn parameters
    loss_func = nn.BCEWithLogitsLoss()
    # OR Hamming Loss
    # Alternatives: Focal Lost for imbalanced data: https://gombru.github.io/2018/05/23/cross_entropy_loss/
    # Also see: https://discuss.pytorch.org/t/how-to-implement-focal-loss-in-pytorch/6469/17

    ##### STEP 2: Data set #####
    all_data = CustomDatasetFromImages(args.data)
    train_data = all_data

    # Data Loader for easy mini-batch return in training, the image batch shape will be (50, 1, 28, 28)
    train_loader = Data.DataLoader(dataset=train_data,
                                   batch_size=BATCH_SIZE,
                                   shuffle=True)
    test_data = CustomDatasetFromImages(args.testData)

    all_x = Variable(torch.unsqueeze(all_data.image_tensors,
                                     dim=1)).type(torch.FloatTensor)
    test_x = Variable(torch.unsqueeze(test_data.image_tensors,
                                      dim=1)).type(torch.FloatTensor)
    # test_y = Variable(torch.unsqueeze(test_data.labels, dim=1)).type(torch.FloatTensor)

    ##### STEP 3: Training and testing #####
    for epoch in range(EPOCH):
        for step, (
                char_index, char, img_as_tensor, label_index_tensor,
                label) in enumerate(
                    train_loader
                ):  # gives batch data, normalize x when iterate train_loader
            if is_cuda:
                label_index_tensor = label_index_tensor.cuda()
                img_as_tensor = img_as_tensor.cuda()
            input = img_as_tensor
            output, embeddings = cnn(input)  # cnn output
            loss = loss_func(output, label_index_tensor)  # loss function
            optimizer.zero_grad()  # clear gradients for this training step
            loss.backward()  # backpropagation, compute gradients
            optimizer.step()  # apply gradients

            if step % 50 == 0 or step == 148:
                if is_cuda:
                    test_x = test_x.cuda()
                test_output, last_layer = cnn(test_x)

                pred_y = torch.max(test_output.cpu(), 1)[1].data.numpy()

                correct = 0
                for i in range(len(test_data.labels)):
                    labels = test_data.labels[i]
                    if (pred_y[i] in labels):
                        correct += 1

                accuracy = float(correct) / float(len(test_data.labels))
                print('Epoch: ', epoch, step,
                      '| train loss: %.4f' % loss.data.cpu().numpy(),
                      '| test accuracy: %.3f' % accuracy)

        if is_cuda:
            all_x = all_x.cuda()
        test_output, embeddings = cnn(all_x)

        # Comment out if we need to save embeddings during the training process
        # with open(str(epoch) + "_" + args.output +".pkl", 'wb') as output:  # Overwrites any existing file.
        #     if is_cuda:
        #         embeddings = embeddings.cpu()
        #     pickle.dump(embeddings, output, pickle.HIGHEST_PROTOCOL)
        #     print("Embedding saved")

    writer = tsv.TsvWriter(open(args.output, "w"))
    output = embeddings.cpu()
    for idx, line in enumerate(output):
        if idx % 100 == 0: print(idx)
        s = ""
        for n in line:
            n = n.detach().numpy()
            s += str(n) + "\t"
        writer.line(s)
    writer.close()
    print("Embedding saved")
Example #17
# Trial #2
import gzip
import pickle

import tsv
file_read = '../f_model2.pickle.gz'
# file_write = 'embedding_2.tsv'

file_write = 'embedding_grey_fc2.tsv'

f = gzip.open(file_read)  # 'r' for reading; can be omitted
mydict = pickle.load(f)  # load file content as mydict
f.close()

print mydict.keys()

print mydict['input_font_bottleneck.W']
print len(mydict['input_font_bottleneck.W'])
print mydict['input_font_bottleneck.W'][0]
print len(mydict['input_font_bottleneck.W'][0])

# print mydict['output_sigmoid.W']
# print mydict['output_sigmoid.W'].shape
# # print len(mydict['dense_0.b'])
# # print mydict['dense_0.b'][0]
# # print len(mydict['dense_0.b'][0])

writer = tsv.TsvWriter(open(file_write, "w"))

for row in mydict['input_font_bottleneck.W']:
    writer.list_line(row)

writer.close()
Example #18
def process_raw_data(raw_data, old_html_dir, options):
    """
    This function receives the file containing raw genomic data that the user
    wants to map to the pre-existing visulization & the location of
    pre-existing visualization files. We will parse this new data file
    placing the rows in an order defined by the genes tab from the pre-existing
    visualization. This way we generate a mutable numpy matrix of raw patient
    data and have the genes in the required by the transform matrix,
    U^T, & S matrices.
    """
    # Create the file paths for the required files
    genes_file_loc = os.path.join(old_html_dir, "genes.tab")
    s_matrix_file_loc = os.path.join(old_html_dir, "S.tab")
    u_t_matrix_file_loc = os.path.join(old_html_dir, "U_T.tab")
    beta_matrix_file_loc = os.path.join(old_html_dir, "beta.tab")
    assignments_file_loc = os.path.join(old_html_dir, "assignments0.tab")

    # First open the genes file.
    genes_reader = tsv.TsvReader(open(genes_file_loc, 'r'))

    # This holds an iterator over lines in that file
    genes_iterator = genes_reader.__iter__()

    # Extract data type of the pre-existing visualization & the list of genes
    old_data_type = next(genes_iterator)
    print("Previous Data Type", old_data_type)

    # First see if the new data and the old data are of compatible data types
    new_data_type = options.type
    old_genes_list = []
    # If they are the same data type add the genes to a python list
    if old_data_type[0] == new_data_type:
        print("Same Data Types")
        old_genes_list = next(genes_iterator)
        genes_reader.close()

        # First open the raw data file.
        raw_data_reader = tsv.TsvReader(open(raw_data, 'r'))
        # This holds an iterator over lines in that file
        raw_data_iterator = raw_data_reader.__iter__()

        sample_names = next(raw_data_iterator)
        sample_names = sample_names[1:]
        num_samples = len(sample_names)
        new_genes_list = []
        for row in raw_data_iterator:
            new_gene = row[0]
            new_genes_list.append(new_gene)
        raw_data_reader.close()

        # Get the number of new samples & number of old genes to create
        # a new numpy data matrix
        print("Number of New Samples:", num_samples)
        num_new_genes = len(new_genes_list)
        print("Number of New genes:", num_new_genes)

        # Re-Initialize the data iterator
        # This holds an iterator over lines in that file
        raw_data_reader = tsv.TsvReader(open(raw_data, 'r'))
        raw_data_iterator = raw_data_reader.__iter__()
        # Skip the first line, which is simply a row of headers
        next(raw_data_iterator)

        # Next we have to dump all the values from the file into a numpy matrix.
        # The values will be unsorted. We will then have to sort the rows of the
        # numpy matrix according to the order prescribed by old_genes_list
        raw_data_matrix_unsorted = numpy.zeros(shape=(num_new_genes,
                                                      num_samples))
        for rindex, row in enumerate(raw_data_iterator):
            # Cut off the first value of each row. It is simply the gene name.
            only_values = row[1:]
            # Place the data from only_values into the appropriate row in
            # raw_data_matrix.
            for cindex, col in enumerate(only_values):
                raw_data_matrix_unsorted[rindex][cindex] = only_values[cindex]

        # For every gene in old_genes_list, search the new_genes_list for the
        # appropriate index. Then use this index to find the values in
        # the unsorted data matrix and copy them into a new sorted matrix.
        # This new matrix will be used to compute the (x,y) coordinates
        # needed to map the new samples.
        num_old_genes = len(old_genes_list)

        #Debugging
        num_no_data = 0

        raw_data_matrix_sorted = numpy.zeros(shape=(num_old_genes,
                                                    num_samples))
        for rindex, gene in enumerate(old_genes_list):
            # Find the index of the desired gene in the new_genes_list
            # This index will correspond to the row in the raw_data_matrix_unsorted
            # that we want to extract and place in the raw_data_matrix_sorted
            try:
                gene_index = new_genes_list.index(gene)
                extracted_data_row = raw_data_matrix_unsorted[gene_index]
                # Iterate over the extracted row to place the values in the appropriate row
                # of the sorted data matrix.
                for cindex, col in enumerate(extracted_data_row):
                    raw_data_matrix_sorted[rindex][
                        cindex] = extracted_data_row[cindex]
            except ValueError:
                num_no_data += 1
        print("Number of genes with no data", num_no_data)

        # Open up S matrix, U^T, and Betas for x,y coordinate computation
        # First open the matrix file.
        s_reader = tsv.TsvReader(open(s_matrix_file_loc, 'r'))
        u_t_reader = tsv.TsvReader(open(u_t_matrix_file_loc, 'r'))
        beta_reader = tsv.TsvReader(open(beta_matrix_file_loc, 'r'))

        # Next create iterators to traverse the files
        s_iterator = s_reader.__iter__()
        u_t_iterator = u_t_reader.__iter__()
        beta_iterator = beta_reader.__iter__()

        # Create an array for s_values & create a diagonal matrix from it
        s_values = next(s_iterator)
        float_s_values = []
        for value in s_values:
            v = float(value)
            float_s_values.append(v)
        s_values = float_s_values

        print("S_values", s_values)
        s_diag = numpy.diag(s_values)
        print(s_diag)

        # Create a numpy matrix for u_t (number of principal components * number of genes)
        u_t = numpy.zeros(shape=(len(s_values), num_old_genes))
        for rindex, row in enumerate(u_t_iterator):
            for cindex, col in enumerate(row):
                u_t[rindex][cindex] = float(row[cindex])

        # Create a numpy matrix for the betas (number of principal components * 2)
        betas = numpy.zeros(shape=(len(s_values), 2))
        for rindex, row in enumerate(beta_iterator):
            for cindex, col in enumerate(row):
                betas[rindex][cindex] = float(row[cindex])
        betas = numpy.transpose(betas)

        # Compute new coordinates
        coords = betas * (numpy.asmatrix(s_diag) * numpy.asmatrix(u_t) *
                          numpy.asmatrix(raw_data_matrix_sorted))
        print("Coordinates")
        print(coords)

        coords = numpy.transpose(coords)
        # Add to existing "assignments.tab" file
        assignments_writer = tsv.TsvWriter(open(assignments_file_loc, 'a'))
        for rindex, sample in enumerate(sample_names):
            print("Cindex", cindex)
            x = str(coords[rindex, 0])
            y = str(coords[rindex, 1])
            print(sample, x, y)
            assignments_writer.line(sample, x, y)

        assignments_writer.close()

    else:
        raise Exception("Pre-existing Visualization employs ", old_data_type,
                        " data. Data to me mapped is of ", new_data_type,
                        ". Data Types must be the same.")

    return True
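The genes.tab parsing above steps an explicit iterator by hand; a sketch of the plainer iteration form for the same two-row layout (data type on the first row, gene list on the second):

import tsv

def read_genes_tab(genes_file_loc):
    # Each row comes back as a list of string fields.
    with open(genes_file_loc) as f:
        rows = list(tsv.TsvReader(f))
    old_data_type = rows[0]   # first row: data type of the old visualization
    old_genes_list = rows[1]  # second row: the gene ordering used by U^T
    return old_data_type, old_genes_list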
Example #19
def main():
    cnn = CNN(300) #embedding_size
    print(cnn)  # net architecture
    cnn.cuda()

    # torch.manual_seed(1)    # reproducible

    # Hyper Parameters
    EPOCH = 5              # number of passes over the training data
    BATCH_SIZE = 64
    LR = 1e-3           # learning rate
    # CLASSES =

    train_data = CustomDatasetFromImages(args.data)
    all_data = CustomDatasetFromImages(args.data)
    print("All:", all_data.data_len)
    # Cat: 123
    # Total:7351

    # plot one example
    # print(train_data.size())                 # (60000, 28, 28)
    # plt.imshow(train_data.train_data[0].numpy(), cmap='gray')
    # plt.title('%i' % train_data.train_labels[0])
    # plt.show()

    # Data Loader for easy mini-batch return in training, the image batch shape will be (50, 1, 28, 28)
    train_loader = Data.DataLoader(dataset=train_data, batch_size=BATCH_SIZE, shuffle=True)
    # pick 2000 samples to speed up testing

    test_data = CustomDatasetFromImages("data/VC/test.json")

    all_x = Variable(torch.unsqueeze(all_data.image_tensors, dim=1)).type(torch.FloatTensor)
    test_x = Variable(torch.unsqueeze(test_data.image_tensors, dim=1)).type(torch.FloatTensor)
    test_y = Variable(torch.unsqueeze(test_data.labels, dim=1)).type(torch.FloatTensor)

    # test_x = torch.unsqueeze(test_data.test_data, dim=1).type(torch.FloatTensor)[:2000]/255.   # shape from (2000, 28, 28) to (2000, 1, 28, 28), value in range(0,1)
    # test_y = test_data.test_labels[:2000]

    optimizer = torch.optim.Adam(cnn.parameters(), lr=LR)   # optimize all cnn parameters
    loss_func = nn.CrossEntropyLoss()                       # the target label is not one-hotted

    # following function (plot_with_labels) is for visualization, can be ignored if not interested
    from matplotlib import cm
    # try: from sklearn.manifold import TSNE; HAS_SK = True
    # except: HAS_SK = False; print('Please install sklearn for layer visualization')

    def plot_with_labels(lowDWeights, labels):
        plt.cla()
        X, Y = lowDWeights[:, 0], lowDWeights[:, 1]
        for x, y, s in zip(X, Y, labels):
            c = cm.rainbow(int(255 * s / 9)); plt.text(x, y, s, backgroundcolor=c, fontsize=9)
        plt.xlim(X.min(), X.max()); plt.ylim(Y.min(), Y.max()); plt.title('Visualize last layer'); plt.show(); plt.pause(0.01)

    plt.ion()
    # training and testing
    for epoch in range(EPOCH):
        for step, (char_index, char, img_as_tensor, label_index, label) in enumerate(train_loader):   # gives batch data, normalize x when iterate train_loader
            input = img_as_tensor.cuda()
            label_index = label_index.cuda()

            # print(input.is_cuda)
            output, embeddings = cnn(input)  # cnn output
            # print(output.is_cuda, label_index.is_cuda)
            loss = loss_func(output, label_index)   # cross entropy loss
            optimizer.zero_grad()           # clear gradients for this training step
            loss.backward()                 # backpropagation, compute gradients
            optimizer.step()                # apply gradients

            if step % 50 == 0:
                test_x = test_x.cuda()
                test_output, last_layer = cnn(test_x)

                pred_y = torch.max(test_output.cpu(), 1)[1].data.numpy()

                accuracy = float((pred_y == test_y.data.numpy()).astype(int).sum()) / float(test_y.size(0))

                print('Epoch: ', epoch, step, '| train loss: %.4f' % loss.data.cpu().numpy(), '| test accuracy: %.2f' % accuracy)
                # if HAS_SK:
                #     # Visualization of trained flatten layer (T-SNE)
                #     tsne = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000)
                #     plot_only = 500
                #     low_dim_embs = tsne.fit_transform(last_layer.data.numpy()[:plot_only, :])
                #     labels = test_y.numpy()[:plot_only]
                #     plot_with_labels(low_dim_embs, labels)
        all_x = all_x.cuda()
        test_output, embeddings = cnn(all_x)

        with open(str(epoch) + "_" + args.output +".pkl", 'wb') as output:  # Overwrites any existing file.
            embeddings = embeddings.cpu()
            print(embeddings.is_cuda)
            pickle.dump(embeddings, output, pickle.HIGHEST_PROTOCOL)
            print("Embedding saved")

    plt.ioff()

    writer = tsv.TsvWriter(open(args.output + ".tsv", "w"))
    output = embeddings.cpu()
    for idx, line in enumerate(output):
        if idx%100 == 0: print(idx)
        s = ""
        for n in line:
            n = n.detach().numpy()
            s += str(n) + "\t"
        writer.line(s)
    writer.close()
    print("Embedding saved")