def main():
    myFile = open('example.tsv', 'w')
    with myFile:
        writer = tsv.TsvWriter(myFile)
        writer.line("first_name", "second_name", "Grade")
        writer.line('Alex', 'Brian', 'A')
        writer.line('Tom', 'Smith', 'B')
def main(args): """ Parses command line arguments and do the work of the program. "args" specifies the program arguments, with args[0] being the executable name. The return value should be used as the program's exit code. """ options = parse_args(args) # This holds the nicely-parsed options object writer = tsv.TsvWriter(options.out_file) # We use this to keep our state: pairs of SeqRecord and current offset batch_state = [] # We use this to keep track of the matches found for each query sequence ID matches = collections.defaultdict(set) # OK so we need to run that update function a bunch and feed in data for record in Bio.SeqIO.parse(options.query_name, "fasta"): # For each record to place while len(batch_state) >= options.batch_size: # Run through existing batches until there's room batch_state = run_batch(options, batch_state, matches, writer) # Start a new fake thread on this record batch_state.append((record, 0)) # Now run all the records to completion while len(batch_state) > 0: batch_state = run_batch(options, batch_state, matches, writer) return 0
def saveBasesDropped(job, options, return_value_dict, out_key):
    """
    Given a dict from region name, graph name, and sample name to the return
    value tuple of convertGlennToVcf, which is (vcf_file_id, bases_dropped),
    save a TSV in the format <region>\t<graph>\t<sample>\t<bases dropped> to
    the given key.
    """

    # We need to save the TSV file somewhere local
    local_filename = os.path.join(job.fileStore.getLocalTempDir(), "stats.tsv")
    writer = tsv.TsvWriter(open(local_filename, "w"))

    # Make the IOStores
    graph_store = IOStore.get(options.graph_store)
    call_store = IOStore.get(options.call_store)
    out_store = IOStore.get(options.out_store)

    for region_name, by_graph in return_value_dict.iteritems():
        # For each region and the graphs in it
        for graph_name, by_sample in by_graph.iteritems():
            # For each graph and the samples on it
            for sample_name, stats in by_sample.iteritems():
                # For each sample and its return value tuple,
                # save the bases dropped
                writer.line(region_name, graph_name, sample_name, stats[1])

    # Save the aggregated output
    writer.close()
    out_store.write_output_file(local_filename, out_key)
def main():
    # Creates a file called example.tsv; "w" means you can write to the file
    myFile = open('example.tsv', 'w')
    with myFile:
        # Writes to the file and includes whatever data you put in here
        writer = tsv.TsvWriter(myFile)
        writer.line("first_name", "second_name", "Grade")
        writer.line('Alex', 'Brian', 'A')
        writer.line('Tom', 'Smith', 'B')
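# The example above only writes example.tsv. As a quick sketch (not part of the
# original example), the rows can be read back with the same module's
# TsvReader, which the later examples in this collection also use; iterating it
# yields each row as a list of string fields.
import tsv

reader = tsv.TsvReader(open('example.tsv', 'r'))
for fields in reader:
    print(fields)  # e.g. ['first_name', 'second_name', 'Grade']
reader.close()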
def generateMetaTSVFromData(inputPath, outputPath):
    with open(inputPath) as f:
        data = json.load(f)

    writer = tsv.TsvWriter(open(outputPath, "w"))
    print("Generate meta file for", len(data), " entries")
    # writer.comment("id character class")
    writer.line("character\tclass")
    for id, idx in enumerate(data):
        # item is (id, char, class)
        item = data[idx]
        writer.line(item[1] + "\t" + item[2])
    writer.close()
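# Note on the example above: it joins fields with an explicit "\t" before
# passing a single string to writer.line(). The resulting file is the same,
# but the multi-argument form used by the other examples in this collection
# lets TsvWriter insert the separators itself. A minimal self-contained sketch
# (meta_sketch.tsv is a hypothetical output path):
import tsv

writer = tsv.TsvWriter(open("meta_sketch.tsv", "w"))
writer.line("character", "class")        # header as two fields, no embedded tab
writer.line("some_char", "some_class")   # one data row
writer.close()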
def generate_qa_sheets(file_path):
    writer = tsv.TsvWriter(open(file_path, 'w'))
    # title
    writer.line('Question', 'Answer', 'Source', 'Metadata')

    with open('../../data/environment.json', 'r') as f:
        environ = json.load(f)
    url_list = environ['url_list']

    for url in url_list:
        results = get_information(url)
        for res in results:
            # if not res['q'] or not res['a'] or not res['s'] or not res['m']:
            #     continue
            writer.line(res['q'], res['a'], res['s'], res['m'])

    writer.close()
def export_shop_info():
    writer = tsv.TsvWriter(open("shop_info.tsv", "w"))
    # Add a comment line describing the columns
    writer.comment("Site name, hot-selling styles: url (link), product_type (product type), address (site)")
    writer.line("url", "product_type", "address")

    client = MongoClient('127.0.0.1', 27017)
    db = client.seventeen_zwd
    db_name = "station_message"
    for i in list(db[db_name].find()):
        data = []
        data.append(i["url"])
        data.append(i["product_type"])
        data.append(i["address"])
        writer.list_line(data)
    writer.close()
def export_product_info():
    writer = tsv.TsvWriter(open("shop_type_message.tsv", "w"))
    # Add a comment line describing the columns
    writer.comment("Market, hot-selling stall and stall type info: url (link), type (category), market (market), address (site)")
    writer.line("url", "type", "market", "address")

    client = MongoClient('127.0.0.1', 27017)
    # Connect to the database we need; sf_fy (SF-FengYan) is the database name
    db = client.seventeen_zwd
    db_name = "shop_type_message"
    for i in list(db[db_name].find()):
        print(json.dumps(i["type"]))
        data = []
        data.append(i["url"])
        data.append(json.dumps(i["type"], ensure_ascii=False))
        data.append(json.dumps(i["market"], ensure_ascii=False))
        data.append(i["address"])
        writer.list_line(data)
    writer.close()
#!/usr/bin/env python
"""
This example shows that you can write bad data into TSV files using the 'tsv'
module.
"""

import tsv

writer = tsv.TsvWriter(open("/tmp/file.tsv", "w"))
writer.line("\t\t\t", "Column 2", 12345)
writer.close()
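# Why that file is "bad": the first field itself contains tab characters, so
# (assuming, as the example's docstring implies, that the writer passes them
# through unescaped) the line no longer has a fixed number of columns. A
# minimal read-back sketch, assuming /tmp/file.tsv was written as above:
import tsv

reader = tsv.TsvReader(open("/tmp/file.tsv", "r"))
for fields in reader:
    # Three logical fields were intended, but the embedded tabs produce
    # extra, empty columns when the line is split on tabs.
    print(len(fields), fields)
reader.close()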
def run(options):
    """
    Do the actual work of the program.
    """

    # Set up our BED output
    writer = tsv.TsvWriter(options.output_bed)

    # Read the PSL
    for result in Bio.SearchIO.parse(options.input_psl, "blat-psl"):
        for hit in result:
            for hsp in hit:
                # A hit is the equivalent of a PSL line; we're going to make
                # it a BED record

                # Pull out the block sizes and starts
                block_sizes = list(hsp.hit_span_all)
                block_starts = list(hsp.query_start_all)

                # We need to find out the strand
                strand = "+"

                for fragment in hsp:
                    # Loop over all the fragments. We know they'll all be on
                    # the same strand, because that's how PSL can articulate
                    # them.
                    if fragment.query_strand == -1:
                        # We ought to be on the - strand. This means our
                        # blocks are going to be in backwards order.
                        strand = "-"
                        block_sizes.reverse()
                        block_starts.reverse()
                        break

                # Convert starts to be relative to aligned region
                block_starts = [x - hsp.query_start for x in block_starts]

                # The first should always be 0
                assert(block_starts[0] == 0)

                # BED is: chrom, chromStart, chromEnd, name, score, strand,
                # thickStart, thickEnd, itemRgb, blockCount, blockSizes,
                # blockStarts
                bed_record = [
                    # chrom
                    options.contig or result.id,
                    # chromStart
                    hsp.query_start + options.offset,
                    # chromEnd
                    hsp.query_end + options.offset,
                    # name
                    hit.id,
                    # score
                    0,
                    # strand
                    strand,
                    # thickStart
                    hsp.query_start + options.offset,
                    # thickEnd
                    hsp.query_start + options.offset,
                    # itemRgb
                    "0,0,0",
                    # blockCount
                    len(hsp),
                    # blockSizes
                    ",".join((str(x) for x in block_sizes)),
                    # blockStarts
                    ",".join((str(x) for x in block_starts))
                ]

                writer.list_line(bed_record)
def main():
    # Where do we find receipts to import
    INBOX_PATH = 'Inbox'
    # Where do we put them?
    DATABASE_PATH = 'Database'

    if not os.path.exists(INBOX_PATH):
        os.makedirs(INBOX_PATH)
    if not os.path.exists(DATABASE_PATH):
        os.makedirs(DATABASE_PATH)

    # All imported receipts will be assigned to today, no matter when they
    # were made
    date_string = time.strftime('%Y/%m/%d')

    # Determine where to put today's receipts
    dest_dir = os.path.join(DATABASE_PATH, date_string)

    # Find all the inbox files
    inbox_files = list(os.listdir(INBOX_PATH))

    # Find all the TXTs
    txt_files = [f for f in inbox_files if f.lower().endswith('.txt')]
    # And the PDFs
    pdf_files = [f for f in inbox_files if f.lower().endswith('.pdf')]

    # TODO: Check for duplicates in different cases of the same extension

    # File them by basename
    txt_by_basename = {f[:-4]: f for f in txt_files}
    pdf_by_basename = {f[:-4]: f for f in pdf_files}

    # Count imported receipts
    import_count = 0

    for basename in txt_by_basename.keys():
        if basename not in pdf_by_basename:
            print("Warning: {} exists but PDF is missing. Skipping!".format(
                txt_by_basename[basename]))
            # Actually skip it, so the missing PDF doesn't raise a KeyError
            continue

        # Find the pair of files
        txt_filename = os.path.join(INBOX_PATH, txt_by_basename[basename])
        pdf_filename = os.path.join(INBOX_PATH, pdf_by_basename[basename])

        if not os.path.exists(dest_dir):
            # Make sure the destination directory exists
            os.makedirs(dest_dir)

        # Get a new unique receipt ID
        # This is super N^2, but you should not have large N receipts for one
        # day.
        receipt_id = get_new_id(dest_dir)

        # Where do we put the raw OCR text
        dest_txt_filename = os.path.join(dest_dir, '{}.txt'.format(receipt_id))
        # And the processed items
        dest_tsv_filename = os.path.join(dest_dir, '{}.tsv'.format(receipt_id))
        # And the PDF
        dest_pdf_filename = os.path.join(dest_dir, '{}.pdf'.format(receipt_id))

        # Count the items
        item_count = 0
        # And the price
        total_price = 0.0

        # Create the TSV
        with open(txt_filename, 'r') as text_in:
            with open(dest_tsv_filename, 'w') as tsv_out:
                # Prepare a TSV writer
                writer = tsv.TsvWriter(tsv_out)
                for item, price in parse_items(text_in):
                    # Save each item and its price
                    writer.line(item, str(price))
                    item_count += 1
                    if not math.isnan(price):
                        total_price += price

        # Move the other files
        move(txt_filename, dest_txt_filename)
        move(pdf_filename, dest_pdf_filename)

        print('Imported {} items with total price {} as {}'.format(
            item_count, total_price, receipt_id))
        import_count += 1

    print("Imported {} receipts".format(import_count))
def main(arguments):
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument('original_input', help="Input file",
                        type=argparse.FileType('r'))
    parser.add_argument('--seq2sick', default=False, action="store_true")
    parser.add_argument('--wordadver', default=False, action="store_true")
    parser.add_argument('--extra_input', help="Input file",
                        type=argparse.FileType('r'))
    parser.add_argument('--output_distance_type', type=str,
                        default="infersent-cosine")
    parser.add_argument('--input_distance_type', type=str,
                        default="infersent-cosine")
    parser.add_argument('--save', default=False, action="store_true")
    parser.add_argument('--plot', default=False, action="store_true")
    parser.add_argument('--get_bootstrap', default=False, action="store_true")
    parser.add_argument('--get_samples', default=False, action="store_true")
    args = parser.parse_args(arguments)
    print(args)

    if args.seq2sick:
        lines = args.original_input.readlines()
        seq2sick_lines = args.extra_input.readlines()
        real_ins = []
        real_outs = []
        adv_ins = []
        adv_outs = []
        for line, seq2sickline in zip(lines, seq2sick_lines):
            adversarial_input, adversarial_output, normal_output = \
                seq2sickline.split("\t")
            adversarial_input = adversarial_input.strip()
            adversarial_output = adversarial_output.strip()
            normal_output = normal_output.strip()
            line = line.strip()
            adv_ins.append(adversarial_input)
            adv_outs.append(adversarial_output)
            real_outs.append(normal_output)
            real_ins.append(line)
    elif args.wordadver:
        lines = args.original_input.readlines()
        reals = lines[0:][::2]
        adversarials = lines[1:][::2]
        real_ins, real_outs = process_lines(reals)
        adv_ins, adv_outs = process_lines(adversarials)
    else:
        raise NotImplementedError

    if args.get_bootstrap:
        gain_boostrap_mean, gain_boostrap_std = calculate_real_gain(
            real_ins, real_outs, 1000, args.input_distance_type,
            args.output_distance_type, model)
        print("Real bootstrap region: {} +/- {}".format(
            gain_boostrap_mean, gain_boostrap_std))

        adversarial_boostrap_mean, adversarial_boostrap_std = \
            calculate_adversarial_gain(
                real_ins, real_outs, adv_ins, adv_outs,
                args.input_distance_type, args.output_distance_type, model)
        print("Adversarial bootstrap region: {} +/- {}".format(
            adversarial_boostrap_mean, adversarial_boostrap_std))

    # TODO: generate graph from some real samples
    if args.plot:
        input_distances, output_distances, gains = \
            calculate_adversarial_gain_details(
                real_ins, real_outs, adv_ins, adv_outs,
                args.input_distance_type, args.output_distance_type, model)
        gains = normalize_array(np.clip(np.array(gains), -100, 100))
        sizes = [np.pi * (4.0 + x * 10)**2 for x in gains]
        # cmap = plt.cm.rainbow
        cmap = plt.get_cmap('inferno')
        norm = matplotlib.colors.Normalize()
        colors = cmap(norm(gains))
        fig = plt.figure(figsize=(16, 8))
        ax = fig.add_subplot(1, 1, 1)
        plt.xlim(xmax=.08)
        plt.ylim(ymax=1.01, ymin=-.01)
        axis_font = {'fontname': 'Arial', 'size': '32'}
        for label in (ax.get_xticklabels() + ax.get_yticklabels()):
            label.set_fontname(axis_font['fontname'])
            label.set_fontsize(axis_font['size'])
        plt.title("Seq2Sick Adversary on GigaWord", **axis_font)
        plt.xlabel("Input Distance" + "\n" + "(InferSent-Cosine)", **axis_font)
        plt.ylabel(r"Output Distance" + "\n" + "(InferSent-Cosine)",
                   **axis_font)
        plt.scatter(input_distances, output_distances, s=sizes, c=colors,
                    alpha=0.65, edgecolors='none')
        if args.save:
            fig.savefig('seq2sick_adversary.pdf', dpi=fig.dpi,
                        bbox_inches='tight')
        else:
            plt.show()

    if args.get_samples:
        input_distances, output_distances, gains = \
            calculate_adversarial_gain_details(
                real_ins, real_outs, adv_ins, adv_outs,
                args.input_distance_type, args.output_distance_type, model)
        import tsv
        most_gain_samples = np.array(gains).argsort()[:][::-1]
        writer = tsv.TsvWriter(open("samples.tsv", "w"))
        writer.list_line([
            "input", "output", "adv_input", "adv_output", "d_in", "d_out",
            "gain"
        ])
        for z in most_gain_samples:
            col = (real_ins[z], real_outs[z], adv_ins[z], adv_outs[z],
                   input_distances[z], output_distances[z], gains[z])
            writer.list_line(col)
        writer.close()
def main(args): """ Parses command line arguments and do the work of the program. "args" specifies the program arguments, with args[0] being the executable name. The return value should be used as the program's exit code. """ if len(args) == 2 and args[1] == "--test": # Run the tests return doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE) options = parse_args(args) # This holds the nicely-parsed options object # Set Entrez e-mail Entrez.email = options.email # Go get the region of the reference we're talking about. Starts and ends # are 1-based. ref_acc, ref_start, ref_end = get_region_info(options.region, options.assembly_url) # Make our output directory if not os.path.exists(options.region): os.makedirs(options.region) # We're going to write a chrom.sizes file with accessions (not the GI # numbers) for the gff3->psl conversion step acc_chrom_sizes = tsv.TsvWriter( open(options.region + "/acc.chrom.sizes", "w")) # Get the reference's GI ref_gi = get_gi_number(ref_acc) print("Reference for {} is GI{}:{}-{} 1-based".format( options.region, ref_gi, ref_start, ref_end)) # Grab the reference sequence ref_seq = get_sequence(ref_gi, ref_start, ref_end) print("Got {}bp for a {}bp reference".format(len(ref_seq), ref_end - ref_start + 1)) if len(ref_seq) > ref_end - ref_start + 1: # Clip it down if it's too long. Assuming we have the correct sort of # coordinates, and that we got served the data starting at the correct # offset. ref_seq = ref_seq[0:ref_end - ref_start + 1] elif len(ref_seq) < ref_end - ref_start: raise RuntimeError("Didn't get enough sequence from the API!") # Change it to be just called "ref" ref_seq.id = "ref" # Write it to <region>/ref.fa SeqIO.write([ref_seq], open("{}/ref.fa".format(options.region), "w"), "fasta") # Write a chromosome size entry for the reference by its accession acc_chrom_sizes.line(ref_acc, get_length(ref_gi)) print("Writing genes for ref") # Make a BED to put reference genes in ref_bed = open_gene_bed(options.region, "ref") for line in get_genes(ref_acc, "ref", ref_start, ref_end): # Write all the BED lines for the appropriate region of the reference to # that file. ref_bed.write(line) ref_bed.close() for alt_acc, alt_unit in get_region_sequences(options.region, options.assembly_url): # For every alt in the region # Get its GI number alt_gi = get_gi_number(alt_acc) print("Downloading alt GI{}".format(alt_gi)) # Grab the sequence data alt_seq = get_sequence(alt_gi) # Write it to <region>/GI<number>.fa SeqIO.write([alt_seq], open("{}/GI{}.fa".format(options.region, alt_gi), "w"), "fasta") # Add this alt to the chromosome-sizes-by-accession file acc_chrom_sizes.line(alt_acc, get_length(alt_gi)) # Sneak into the TSV writer and flush, so the sizes file can now be # read. acc_chrom_sizes.stream.flush() # Where should we put the GFF alignment for this alt to the reference? alt_gff3 = "{}/GI{}.gff3".format(options.region, alt_gi) print("Downloading alignment") # Go download it download_gff3(ref_acc, alt_acc, alt_unit, options.assembly_url, alt_gff3) # And we need to convert that to PSL alt_psl = "{}/GI{}.psl".format(options.region, alt_gi) print("Converting to PSL") # Run the conversion with the bit of the sizes file we have so far. We # need to pass the chrom.sizes file twice now because gff3ToPsl has # changed its interface. 
subprocess.check_call([ "gff3ToPsl", options.region + "/acc.chrom.sizes", options.region + "/acc.chrom.sizes", alt_gff3, alt_psl ]) # Edit the output to point to the GI instead of the accession subprocess.check_call( ["sed", "-i", "s/{}/GI{}/g".format(alt_acc, alt_gi), alt_psl]) print("Writing genes for GI{}".format(alt_gi)) # Make a BED to put alt genes in alt_bed = open_gene_bed(options.region, "GI{}".format(alt_gi)) for line in get_genes(alt_acc, "GI{}".format(alt_gi), alt_parent_grc_id=ref_acc): # Write all the BED lines for the alt to the file alt_bed.write(line) alt_bed.close() # Now we need to do psl2maf, complete with globbing. print("Creating GRC MAF") # Find the psl2maf.py script psl2maf = (os.path.dirname(os.path.realpath(__file__)) + "/psl2maf.py") # Go call psl2maf, moving the reference stuff over to "ref" and shifting it # back so that the first base we clipped out of the reference is 0, # splitting apart mismatches, and making sure to use all the PSLs and MAFs # in our output directory. We make sure to add 1 to the reference start in # the offset, because some basedness-conversion needs to happen. TODO: Make # this a function or make this use an import or somehow de-uglify it. args = ([ psl2maf, "--maf", options.region + "/GRCAlignment.maf", "--referenceOffset", str(-ref_start + 1), "--referenceSequence", "ref", "--noMismatch", "--psls" ] + glob.glob(options.region + "/*.psl") + ["--fastas"] + glob.glob(options.region + "/*.fa")) print("Calling: {}".format(" ".join(args))) subprocess.check_call(args)
def mutual_information_statistics(layers, layer_names, ctx, options):
    """
    For every binary or continuous layer and for every layout, we need a
    mi_<layer number>_<layout number>.tab file, with scores associating that
    layer with other layers of its type, in <other layer name>\t<score>
    format. This uses mutual information (really, normalized redundancy)
    instead of the statistical tests above to produce a score between 0 and 1,
    with higher being more mutual information.
    """

    # We're going to need a mapping from layer name to layer index.
    layer_indices = {name: i for i, name in enumerate(layer_names)}

    for layout_index in ctx.all_hexagons.iterkeys():
        # We look at all layouts for this.
        # We assume layout doesn't somehow change layer types.

        # Get windows in this layout. Doesn't really matter what order they're
        # in, since we only compare within layouts. Keep the threshold used
        # above.
        curated_windows = window_tool(
            options.mi_window_size,
            options.mi_window_size,
            ctx,
            threshold=options.mi_window_threshold,
            layout_index=layout_index
        )

        # This will hold per-window discrete values for each layer for which
        # we are computing mutual information statistics. For binary layers,
        # it is the sum of the number of ones in the window. For non-binary
        # layers, it is the histogram bin number in which the window's average
        # value falls, or a past-the-last-bin number for an empty window with
        # a NaN average. This is indexed by layer name, referencing the window
        # values for that layer.
        layer_window_values = {}

        for layer_name in ctx.binary_layers:
            # For binary layers, get the sum for each window. But use 0 for
            # hexes that don't have values in a certain layer. Also
            # (re)discretize by binning as for continuous below.

            # This holds sums for each layer, for each window.
            window_sums = [
                sum(
                    (layers[layer_name][hexagon]
                     if layers[layer_name].has_key(hexagon) else 0)
                    for hexagon in window
                )
                for window in curated_windows
            ]

            #print 'window_sums:', len(window_sums)
            #pprint.pprint(window_sums)

            if options.mi_binary_binning:
                # We want to bin counts.
                # Now we have our list of the sum values for each window.
                # Histogram bin the non-NaN values. See
                # <https://gist.github.com/elsonidoq/4230222>
                _, bins = numpy.histogram(
                    [total for total in window_sums
                     if not math.isnan(total)])

                # Work out the bin numbers for all the totals (NaN windows get
                # the past-the-end bin)
                layer_window_values[layer_name] = numpy.digitize(window_sums,
                                                                 bins)
            else:
                # Don't bin counts.
                layer_window_values[layer_name] = window_sums

        """ TODO skip continuous for now
        for layer_name in ctx.continuous_layers:
            # For continuous layers, get the average for each window, but
            # discretize using histogram bin number.

            # This holds averages for each window.
            window_averages = []

            for window in curated_windows:
                # Compute the sum of the layer in the window
                window_sum = 0
                # And the number of hexes with values involved
                window_values = 0

                for hexagon in window:
                    if layers[layer_name].has_key(hexagon):
                        # Sum up over all the hexagons in this window with
                        # values for this layer
                        window_sum += layers[layer_name][hexagon]
                        window_values += 1

                if window_values == 0:
                    # Can't take the average. Use NaN
                    window_averages.append(float("NaN"))
                else:
                    # Use the average like we're supposed to
                    # TODO: do we need float() here?
                    window_averages.append(float(window_sum) / window_values)

            # Now we have our list of the average values for each window.
            # Histogram bin the non-NaN values. See
            # <https://gist.github.com/elsonidoq/4230222>
            _, bins = numpy.histogram([average for average in window_averages
                                       if not math.isnan(average)])

            # Work out the bin numbers for all the averages (NaN windows get
            # the past-the-end bin)
            layer_window_values[layer_name] = numpy.digitize(window_averages,
                                                             bins)
        """

        # Number of pairs to run, without comparing a layer to itself.
        pairs_to_run = (len(layer_window_values) ** 2 -
                        len(layer_window_values))

        print timestamp(), "{} pairs to run".format(pairs_to_run)

        # What layer are we writing the file for?
        current_first_layer = None
        # Where are we writing it to?
        information_writer = None
        # How many pairs have we done?
        pair = 0

        #print('layer_window_values')
        #pprint.pprint(layer_window_values)

        message_count = 1

        for (layer_a, layer_b, redundancy) in mutualInfo.all_pairs(
                layer_window_values):
            # Go get mutual information for each pair of layers, grouped by
            # the first layer.

            if layer_a != current_first_layer:
                # We're changing first layers.

                if information_writer is not None:
                    # Close the previous file.
                    information_writer.close()

                # Open a tsv writer for the new first layer's redundancies
                # with everyone else.
                information_writer = tsv.TsvWriter(open(os.path.join(
                    options.directory, "mi_{}_{}.tab".format(
                        layout_index, layer_indices[layer_a])), "w"))

                # Record that we're on that layer as the first layer now.
                current_first_layer = layer_a

            # Make a line for redundancy with this other layer.
            information_writer.line(layer_b, str(redundancy))

            pair += 1

            # Log a progress message for every ~1/30th of pairs processed
            if pair > pairs_to_run * message_count / 30:
                print timestamp(), str(message_count) + '/30 of', pairs_to_run, 'pairs'
                sys.stdout.flush()
                message_count += 1

        print timestamp(), "{} pairs processed".format(pair)

        if information_writer is not None:
            # Close the last file.
            information_writer.close()
import tsv

print('\n\n\t\tScraping Data to text file...\n\n')

data = open(
    'E:/Freelancing Project/abusive/Abusive_Language_Detector/Abusive_Language_Detector/data/text.txt',
    'r', encoding='utf8').read()
# Replace the Bengali danda (sentence terminator) with a period
data = data.replace('।', '.')
print('Scraped Data: ', data)

blob = TextBlob(data)

workbook = xlsxwriter.Workbook('../output/output.xlsx')
worksheet = workbook.add_worksheet()

writer = tsv.TsvWriter(open("../data/test.tsv", "w"))
writer.line("test_id", "comment")

worksheet.write(0, 0, 'Content ID')
worksheet.write(0, 1, 'Bangla Text')
worksheet.write(0, 2, 'English Text')
worksheet.write(0, 3, 'Prediction')

print('\n\n\t\tSplitting Data to .tsv file...\n\n')

i = 1
print('For each sentence in paragraph:\n\n')
for sentence in blob.sentences:
    en = str(sentence.translate(to='en'))
    print(i, '. ', sentence, ' - ', en)
    worksheet.write(i, 0, i)
def main(args):
    ##### STEP 1: CNN architecture #####
    cnn = CNN()
    print(cnn)
    if is_cuda:
        cnn.cuda()

    optimizer = torch.optim.Adam(cnn.parameters(), lr=LR)  # optimize all cnn parameters
    loss_func = nn.BCEWithLogitsLoss()  # OR Hamming Loss
    # Alternatives: Focal Loss for imbalanced data:
    # https://gombru.github.io/2018/05/23/cross_entropy_loss/
    # Also see: https://discuss.pytorch.org/t/how-to-implement-focal-loss-in-pytorch/6469/17

    ##### STEP 2: Data set #####
    all_data = CustomDatasetFromImages(args.data)
    train_data = all_data

    # Data Loader for easy mini-batch return in training; the image batch
    # shape will be (50, 1, 28, 28)
    train_loader = Data.DataLoader(dataset=train_data, batch_size=BATCH_SIZE,
                                   shuffle=True)

    test_data = CustomDatasetFromImages(args.testData)

    all_x = Variable(torch.unsqueeze(all_data.image_tensors,
                                     dim=1)).type(torch.FloatTensor)
    test_x = Variable(torch.unsqueeze(test_data.image_tensors,
                                      dim=1)).type(torch.FloatTensor)
    # test_y = Variable(torch.unsqueeze(test_data.labels, dim=1)).type(torch.FloatTensor)

    ##### STEP 3: Training and testing #####
    for epoch in range(EPOCH):
        for step, (char_index, char, img_as_tensor, label_index_tensor,
                   label) in enumerate(train_loader):
            # gives batch data, normalize x when iterating train_loader
            if is_cuda:
                label_index_tensor = label_index_tensor.cuda()
                img_as_tensor = img_as_tensor.cuda()
            input = img_as_tensor

            output, embeddings = cnn(input)               # cnn output
            loss = loss_func(output, label_index_tensor)  # loss function
            optimizer.zero_grad()  # clear gradients for this training step
            loss.backward()        # backpropagation, compute gradients
            optimizer.step()       # apply gradients

            if step % 50 == 0 or step == 148:
                if is_cuda:
                    test_x = test_x.cuda()
                test_output, last_layer = cnn(test_x)
                pred_y = torch.max(test_output.cpu(), 1)[1].data.numpy()
                correct = 0
                for i in range(len(test_data.labels)):
                    labels = test_data.labels[i]
                    if (pred_y[i] in labels):
                        correct += 1
                accuracy = float(correct) / float(len(test_data.labels))
                print('Epoch: ', epoch, step,
                      '| train loss: %.4f' % loss.data.cpu().numpy(),
                      '| test accuracy: %.3f' % accuracy)

    if is_cuda:
        all_x = all_x.cuda()
    test_output, embeddings = cnn(all_x)

    # Comment out if we need to save embeddings during the training process
    # with open(str(epoch) + "_" + args.output + ".pkl", 'wb') as output:
    #     # Overwrites any existing file.
    #     if is_cuda:
    #         embeddings = embeddings.cpu()
    #     pickle.dump(embeddings, output, pickle.HIGHEST_PROTOCOL)
    #     print("Embedding saved")

    writer = tsv.TsvWriter(open(args.output, "w"))
    output = embeddings.cpu()
    for idx, line in enumerate(output):
        if idx % 100 == 0:
            print(idx)
        s = ""
        for n in line:
            n = n.detach().numpy()
            s += str(n) + "\t"
        writer.line(s)
    writer.close()
    print("Embedding saved")
# Trial #2
file_read = '../f_model2.pickle.gz'
# file_write = 'embedding_2.tsv'
file_write = 'embedding_grey_fc2.tsv'

f = gzip.open(file_read)   # 'r' for reading; can be omitted
mydict = pickle.load(f)    # load file content as mydict
f.close()

print mydict.keys()
print mydict['input_font_bottleneck.W']
print len(mydict['input_font_bottleneck.W'])
print mydict['input_font_bottleneck.W'][0]
print len(mydict['input_font_bottleneck.W'][0])

# print mydict['output_sigmoid.W']
# print mydict['output_sigmoid.W'].shape
# print len(mydict['dense_0.b'])
# print mydict['dense_0.b'][0]
# print len(mydict['dense_0.b'][0])

writer = tsv.TsvWriter(open(file_write, "w"))
for row in mydict['input_font_bottleneck.W']:
    writer.list_line(row)
writer.close()
def process_raw_data(raw_data, old_html_dir, options):
    """
    This function receives the file containing raw genomic data that the user
    wants to map to the pre-existing visualization & the location of the
    pre-existing visualization files. We will parse this new data file,
    placing the rows in an order defined by the genes tab from the
    pre-existing visualization. This way we generate a mutable numpy matrix of
    raw patient data and have the genes in the order required by the transform
    matrix, U^T, & S matrices.
    """

    # Create the file paths for the required files
    genes_file_loc = os.path.join(old_html_dir, "genes.tab")
    s_matrix_file_loc = os.path.join(old_html_dir, "S.tab")
    u_t_matrix_file_loc = os.path.join(old_html_dir, "U_T.tab")
    beta_matrix_file_loc = os.path.join(old_html_dir, "beta.tab")
    assignments_file_loc = os.path.join(old_html_dir, "assignments0.tab")

    # First open the genes file.
    genes_reader = tsv.TsvReader(open(genes_file_loc, 'r'))

    # This holds an iterator over lines in that file
    genes_iterator = genes_reader.__iter__()

    # Extract data type of the pre-existing visualization & the list of genes
    old_data_type = genes_iterator.next()
    print("Previous Data Type", old_data_type)

    # First see if the new data and the old data are of compatible data types
    new_data_type = options.type
    old_genes_list = []

    # If they are the same data type add the genes to a python list
    if old_data_type[0] == new_data_type:
        print("Same Data Types")
        old_genes_list = genes_iterator.next()
        genes_reader.close()

        # First open the raw data file.
        raw_data_reader = tsv.TsvReader(open(raw_data, 'r'))
        # This holds an iterator over lines in that file
        raw_data_iterator = raw_data_reader.__iter__()

        sample_names = raw_data_iterator.next()
        sample_names = sample_names[1:]
        num_samples = len(sample_names)

        new_genes_list = []
        for row in raw_data_iterator:
            new_gene = row[0]
            new_genes_list.append(new_gene)
        raw_data_reader.close()

        # Get the number of new samples & number of old genes to create
        # a new numpy data matrix
        print("Number of New Samples:", num_samples)
        num_new_genes = len(new_genes_list)
        print("Number of New genes:", num_new_genes)

        # Re-initialize the data iterator
        # This holds an iterator over lines in that file
        raw_data_reader = tsv.TsvReader(open(raw_data, 'r'))
        raw_data_iterator = raw_data_reader.__iter__()
        # Skip the first line, which is simply a row of headers
        raw_data_iterator.next()

        # Next we have to dump all the values from the file into a numpy
        # matrix. The values will be unsorted. We will then have to sort the
        # rows of the numpy matrix according to the order prescribed by
        # old_genes_list.
        raw_data_matrix_unsorted = numpy.zeros(shape=(num_new_genes,
                                                      num_samples))
        for rindex, row in enumerate(raw_data_iterator):
            # Cut off the first value of each row. It is simply the gene name.
            only_values = row[1:]
            # Place the data from only_values into the appropriate row in
            # raw_data_matrix.
            for cindex, col in enumerate(only_values):
                raw_data_matrix_unsorted[rindex][cindex] = only_values[cindex]

        # For every gene in old_genes_list search the new_genes_list for the
        # appropriate index. Then use this index to find the values in the
        # unsorted data matrix and copy them to a new sorted matrix. This new
        # matrix will be used to compute the (x,y) coordinates needed to map
        # the new samples.
        num_old_genes = len(old_genes_list)

        # Debugging
        num_no_data = 0

        raw_data_matrix_sorted = numpy.zeros(shape=(num_old_genes,
                                                    num_samples))
        for rindex, gene in enumerate(old_genes_list):
            # Find the index of the desired gene in the new_genes_list.
            # This index will correspond to the row in the
            # raw_data_matrix_unsorted that we want to extract and place in
            # the raw_data_matrix_sorted.
            try:
                gene_index = new_genes_list.index(gene)
                extracted_data_row = raw_data_matrix_unsorted[gene_index]
                # Iterate over the extracted row to place the values in the
                # appropriate row of the sorted data matrix.
                for cindex, col in enumerate(extracted_data_row):
                    raw_data_matrix_sorted[rindex][cindex] = \
                        extracted_data_row[cindex]
            except ValueError:
                num_no_data += 1

        print("Number of genes with no data", num_no_data)

        # Open up S matrix, U^T, and betas for x,y coordinate computation.
        # First open the matrix files.
        s_reader = tsv.TsvReader(open(s_matrix_file_loc, 'r'))
        u_t_reader = tsv.TsvReader(open(u_t_matrix_file_loc, 'r'))
        beta_reader = tsv.TsvReader(open(beta_matrix_file_loc, 'r'))

        # Next create iterators to traverse the files
        s_iterator = s_reader.__iter__()
        u_t_iterator = u_t_reader.__iter__()
        beta_iterator = beta_reader.__iter__()

        # Create an array for s_values & create a diagonal matrix from it
        s_values = s_iterator.next()
        float_s_values = []
        for value in s_values:
            v = float(value)
            float_s_values.append(v)
        s_values = float_s_values
        print("S_values", s_values)

        s_diag = numpy.diag(s_values)
        print(s_diag)

        # Create a numpy matrix for u_t
        # (number of principal components * number of genes)
        u_t = numpy.zeros(shape=(len(s_values), num_old_genes))
        for rindex, row in enumerate(u_t_iterator):
            for cindex, col in enumerate(row):
                u_t[rindex][cindex] = float(row[cindex])

        # Create a numpy matrix for the betas
        # (number of principal components * 2)
        betas = numpy.zeros(shape=(len(s_values), 2))
        for rindex, row in enumerate(beta_iterator):
            for cindex, col in enumerate(row):
                betas[rindex][cindex] = float(row[cindex])
        betas = numpy.transpose(betas)

        # Compute new coordinates
        coords = betas * (numpy.asmatrix(s_diag) * numpy.asmatrix(u_t) *
                          numpy.asmatrix(raw_data_matrix_sorted))
        print("Coordinates")
        print(coords)
        coords = numpy.transpose(coords)

        # Add to existing "assignments.tab" file
        assignments_writer = tsv.TsvWriter(open(assignments_file_loc, 'a'))
        for rindex, sample in enumerate(sample_names):
            print("Cindex", cindex)
            x = str(coords[rindex, 0])
            y = str(coords[rindex, 1])
            print(sample, x, y)
            assignments_writer.line(sample, x, y)
        assignments_writer.close()
    else:
        raise Exception("Pre-existing Visualization employs ", old_data_type,
                        " data. Data to be mapped is of ", new_data_type,
                        ". Data Types must be the same.")

    return True
def main():
    cnn = CNN(300)  # embedding_size
    print(cnn)  # net architecture
    cnn.cuda()

    # torch.manual_seed(1)  # reproducible

    # Hyper Parameters
    EPOCH = 5  # train the training data n times; to save time, we just train 1 epoch
    BATCH_SIZE = 64
    LR = 1e-3  # learning rate
    # CLASSES =

    train_data = CustomDatasetFromImages(args.data)
    all_data = CustomDatasetFromImages(args.data)
    print("All:", all_data.data_len)
    # Cat: 123
    # Total: 7351

    # plot one example
    # print(train_data.size())  # (60000, 28, 28)
    # plt.imshow(train_data.train_data[0].numpy(), cmap='gray')
    # plt.title('%i' % train_data.train_labels[0])
    # plt.show()

    # Data Loader for easy mini-batch return in training; the image batch
    # shape will be (50, 1, 28, 28)
    train_loader = Data.DataLoader(dataset=train_data, batch_size=BATCH_SIZE,
                                   shuffle=True)

    # pick 2000 samples to speed up testing
    test_data = CustomDatasetFromImages("data/VC/test.json")

    all_x = Variable(torch.unsqueeze(all_data.image_tensors,
                                     dim=1)).type(torch.FloatTensor)
    test_x = Variable(torch.unsqueeze(test_data.image_tensors,
                                      dim=1)).type(torch.FloatTensor)
    test_y = Variable(torch.unsqueeze(test_data.labels,
                                      dim=1)).type(torch.FloatTensor)
    # test_x = torch.unsqueeze(test_data.test_data, dim=1).type(torch.FloatTensor)[:2000]/255.
    # shape from (2000, 28, 28) to (2000, 1, 28, 28), value in range(0,1)
    # test_y = test_data.test_labels[:2000]

    optimizer = torch.optim.Adam(cnn.parameters(), lr=LR)  # optimize all cnn parameters
    loss_func = nn.CrossEntropyLoss()  # the target label is not one-hotted

    # following function (plot_with_labels) is for visualization, can be
    # ignored if not interested
    from matplotlib import cm
    # try: from sklearn.manifold import TSNE; HAS_SK = True
    # except: HAS_SK = False; print('Please install sklearn for layer visualization')

    def plot_with_labels(lowDWeights, labels):
        plt.cla()
        X, Y = lowDWeights[:, 0], lowDWeights[:, 1]
        for x, y, s in zip(X, Y, labels):
            c = cm.rainbow(int(255 * s / 9))
            plt.text(x, y, s, backgroundcolor=c, fontsize=9)
        plt.xlim(X.min(), X.max())
        plt.ylim(Y.min(), Y.max())
        plt.title('Visualize last layer')
        plt.show()
        plt.pause(0.01)

    plt.ion()

    # training and testing
    for epoch in range(EPOCH):
        for step, (char_index, char, img_as_tensor, label_index,
                   label) in enumerate(train_loader):
            # gives batch data, normalize x when iterating train_loader
            input = img_as_tensor.cuda()
            label_index = label_index.cuda()
            # print(input.is_cuda)

            output, embeddings = cnn(input)  # cnn output
            # print(output.is_cuda, label_index.is_cuda)
            loss = loss_func(output, label_index)  # cross entropy loss
            optimizer.zero_grad()  # clear gradients for this training step
            loss.backward()        # backpropagation, compute gradients
            optimizer.step()       # apply gradients

            if step % 50 == 0:
                test_x = test_x.cuda()
                test_output, last_layer = cnn(test_x)
                pred_y = torch.max(test_output.cpu(), 1)[1].data.numpy()
                accuracy = float((pred_y == test_y.data.numpy()).astype(int).sum()) / float(test_y.size(0))
                print('Epoch: ', epoch, step,
                      '| train loss: %.4f' % loss.data.cpu().numpy(),
                      '| test accuracy: %.2f' % accuracy)
                # if HAS_SK:
                #     # Visualization of trained flatten layer (T-SNE)
                #     tsne = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000)
                #     plot_only = 500
                #     low_dim_embs = tsne.fit_transform(last_layer.data.numpy()[:plot_only, :])
                #     labels = test_y.numpy()[:plot_only]
                #     plot_with_labels(low_dim_embs, labels)

        all_x = all_x.cuda()
        test_output, embeddings = cnn(all_x)
        with open(str(epoch) + "_" + args.output + ".pkl", 'wb') as output:
            # Overwrites any existing file.
            embeddings = embeddings.cpu()
            print(embeddings.is_cuda)
            pickle.dump(embeddings, output, pickle.HIGHEST_PROTOCOL)
            print("Embedding saved")

    plt.ioff()

    writer = tsv.TsvWriter(open(args.output + ".tsv", "w"))
    output = embeddings.cpu()
    for idx, line in enumerate(output):
        if idx % 100 == 0:
            print(idx)
        s = ""
        for n in line:
            n = n.detach().numpy()
            s += str(n) + "\t"
        writer.line(s)
    writer.close()
    print("Embedding saved")