def summarise(self):
    """Scan the combined CSV file and total up the number of errors, warnings
    etc. Also record the total number of "events" per component."""
    self.events = {}
    self.components = {}

    if self.limit > 0:
        csv.field_size_limit(self.limit)

    reader = csv.reader(open(self.csv, "rb"))
    for row in reader:
        event = row[0]

        if event == "info" and row[2] == "version":
            self.raptor_version = row[3]
            continue

        if event in self.events:
            self.events[event] += 1
        else:
            self.events[event] = 1

        bldinf = row[1]
        if bldinf in self.components:
            self.components[bldinf] += 1
        else:
            self.components[bldinf] = 1

    if self.verbose:
        for (event, count) in self.events.items():
            print("{0} : {1}".format(event, count))
        print("{0} components".format(len(self.components)))
def parseCSVfile(filename):
    global no_of_printed_st
    global no_of_empty_st

    csv.field_size_limit(10000000)  # because we deal with huge csv files

    with open(filename, 'rb') as data:
        reader = csv.reader(data)
        try:
            try:
                # create a new file or overwrite an existing file
                new_file = filename.rstrip('.csv') + ".txt"
                print new_file
                f = open(new_file, "w")
            except IOError:
                pass

            try:
                for row in reader:
                    st_str = "".join(row)          # stack trace as string
                    st_list = st_str.split("\n")   # in list
                    st_dict = keepSTIntoDict(st_list)         # in dictionary
                    r_st_list = reverseSTDictValues(st_dict)  # reversed values for each key

                    # print the stack trace only if the reversed list is not empty
                    if isr_st_listEmpty(r_st_list) == False:
                        printValidST(f, r_st_list)
                        checkUnknownExceptionExistence(r_st_list)
                        no_of_printed_st = no_of_printed_st + 1  # increase the number of printed stack traces
                    else:
                        no_of_empty_st = no_of_empty_st + 1      # increase the number of empty stack traces
            finally:
                f.close()
        except csv.Error as e:
            sys.exit('file %s, line %d: %s' % (filename, reader.line_num, e))
def __init__(self, f, encoding='utf-8', maxfieldsize=None, **kwargs):
    f = UTF8Recoder(f, encoding)
    self.reader = csv.reader(f, **kwargs)
    if maxfieldsize:
        csv.field_size_limit(maxfieldsize)
def readVCF(self, data):
    print "interpreting as a VCF file"
    csv.field_size_limit(1000000000)

    # load a tab delimited file of SNPs -> genotypes (like 23andme generates)
    reader = csv.reader(data, delimiter='\t')
    print "length of file: ", reader.line_num

    i = 0
    for row in reader:
        # print i
        i = i + 1
        if '#' in row[0]:
            print row
            continue
        if (len(row) == 10):
            rsid = row[2]
            ref = row[3]
            alt = row[4]
            data = row[9]
            allele1 = data[0]
            allele2 = data[2]
            genotype = ""
            if allele1[0] == '0':
                genotype = genotype + ref
            elif allele1[0] == '1':
                genotype = genotype + alt
            else:
                print allele1, " issue with ", rsid
                print row
            if allele2[0] == '0':
                genotype = genotype + ref
            elif allele2[0] == '1':
                genotype = genotype + alt
            else:
                print allele2, " issue with ", rsid
                print row
            self.SNPs[rsid] = genotype  # set SNP -> genotype value
    print "done reading VCF file"
def writetest(Xpreds, fil='testresultsNN.csv'):
    import csv
    csv.field_size_limit(1000000000)
    outwriter = csv.writer(open(fil, 'w'), delimiter=",")
    rows = np.arange(0, len(Xpreds))
    for row in rows:
        outwriter.writerow([row + 1, Xpreds[row]])
def readCSV(resultfile):
    csv.field_size_limit(sys.maxint)
    csvReader = csv.reader(open(resultfile, "rb"), delimiter=',')
    results = []
    for row in csvReader:
        results.append(row)
    return (results[0], results[1:])
def Load(self, kind, data):
    """Parses CSV data, uses a Loader to convert to entities, and stores them.

    On error, fails fast. Returns a "bad request" HTTP response code and
    includes the traceback in the output.

    Args:
      kind: a string containing the entity kind that this loader handles
      data: a string containing the CSV data to load

    Returns:
      tuple (response code, output) where:
        response code: integer HTTP response code to return
        output: string containing the HTTP response body
    """
    Validate(kind, basestring)
    Validate(data, basestring)
    output = []

    try:
        loader = Loader.RegisteredLoaders()[kind]
    except KeyError:
        output.append('Error: no Loader defined for kind %s.' % kind)
        return (httplib.BAD_REQUEST, ''.join(output))

    buffer = StringIO.StringIO(data)
    reader = csv.reader(buffer, skipinitialspace=True)

    try:
        csv.field_size_limit(800000)
    except AttributeError:
        pass

    return self.LoadEntities(self.IterRows(reader), loader)
def _get_city_db():
    csv.field_size_limit(sys.maxsize)
    cities_file = os.path.join(os.path.dirname(__file__), 'cities.txt')
    with open(cities_file, 'rt') as f:
        r = csv.reader(f, delimiter='\t')
        city_db = list(r)
    return city_db
def loadTraces(fileName):
    """
    Load network traces from CSV
    :param fileName: (str) name of the file
    :return traces: (dict) network traces. E.g: activeCells, sensorValues, etc.
    """
    csv.field_size_limit(sys.maxsize)
    with open(fileName, 'rb') as fr:
        reader = csv.reader(fr)
        headers = reader.next()

        traces = dict()
        for field in headers:
            traces[field] = []

        for row in reader:
            for i in range(len(row)):
                if len(row[i]) == 0:
                    data = []
                else:
                    if headers[i] in ['tmPredictedActiveCells',
                                      'tpActiveCells',
                                      'tmActiveCells']:
                        if row[i] == '[]':
                            data = []
                        else:
                            data = map(int, row[i][1:-1].split(','))
                    else:
                        data = float(row[i])
                traces[headers[i]].append(data)

    return traces
def writetest(idx, Xpreds, fil='NN.512.256.64.csv'):
    import csv
    csv.field_size_limit(1000000000)
    outwriter = csv.writer(open(fil, 'w'), delimiter=",")
    rows = np.arange(0, len(Xpreds))
    for row in rows:
        outwriter.writerow([int(idx[row]), Xpreds[row]])
def main(train_file, test_file):
    # print "loading data.."
    csv.field_size_limit(1310720)
    trainreader = csv.reader(open('/home/kiran/kdd/train.csv'))
    projectid, traindata_old = zip(*trainreader)
    testreader = csv.reader(open('/home/kiran/kdd/test.csv'))
    projectid, testdata_old = zip(*testreader)

    # remove stopwords
    traindata = []
    testdata = []
    for observation in traindata_old:
        traindata.append(preprocess_pipeline(observation, "english",
                                             "PorterStemmer", True, True, False))
    for observation in testdata_old:
        testdata.append(preprocess_pipeline(observation, "english",
                                            "PorterStemmer", True, True, False))

    tfv = CountVectorizer(binary=1, ngram_range=(1, 1))
    X_all = traindata + testdata
    lentrain = len(traindata)
    tfv.fit(X_all)
    X_all = tfv.transform(X_all)
    X = X_all[:lentrain]
    X_test = X_all[lentrain:]

    scipy.io.mmwrite('x_train_bin_1gram.mtx', X, field='real')
    scipy.io.mmwrite('x_test_bin_1gram.mtx', X_test, field='real')

    myCols = tfv.get_feature_names()
    myCols = DataFrame(myCols)
    myCols.to_csv('bin_1gram.csv', index=False)
def dic_gen(file):
    dict_info = {}
    csv.field_size_limit(1000000000)
    reader = csv.reader(open(file), delimiter=' ')

    for row in reader:
        intron = row[0]
        coverage = int(row[1])
        chr = row[2]
        strand = row[3]
        istart = row[4]
        iend = row[5]
        ilength = int(row[6])
        dn = row[7]
        dn_type = row[8]
        dn_type_score = row[9]
        reads = row[10]

        total_introns.add((intron, chr, strand, istart, iend, ilength, dn,
                           dn_type, dn_type_score))
        dict_info[intron] = [coverage, ilength]  # for cDNA_EST entries, ilength is the EST coverage

    return dict_info
def main(introns_final_table):
    csv.field_size_limit(1000000000)
    reader1 = csv.reader(open(introns_final_table), delimiter=' ')

    dn_type = defaultdict(int)
    Total = 0
    dns = []

    for row in reader1:
        intron = row[0]
        dn = row[7]
        dn_type[dn] += 1
        Total += 1
        dns.append(dn)

    print "TOTAL =", Total
    print "Dinucleotide_TYPE", "Number", "%"

    dn_frec = dn_type.items()
    dn_frec.sort(key=lambda x: x[1])

    for i in reversed(dn_frec):
        dn = i[0]
        frec = i[1]
        print dn, frec, percent(frec, Total)
def parsecsv(self, fname, upperbound=None):
    """Parse CSV file containing talk data.

    This should be replaceable with something that talks to the backing
    database containing the actual data.
    """
    csv.field_size_limit(sys.maxsize)
    with open(fname, 'r') as csvfile:
        talkreader = csv.reader(csvfile)
        rownum = 0
        header = []
        results = []
        for row in talkreader:
            result = {}
            if rownum == 0:
                header = row
            elif upperbound is not None and rownum > upperbound:
                break
            else:
                for i, column in enumerate(header):
                    result[column] = row[i]
                result['text'] = self.parsebody(result['body'])
                results.append(result)
            rownum += 1
    return results
def __tagProcessTask(filename, savefile):
    import csv
    import sys
    import os
    import time

    csv.field_size_limit(10000000)
    b = time.time()

    f = open(filename, 'rU')
    col = dict((cn, i) for i, cn in enumerate(f.readline().split(',')))
    tagRules = __getTagRules(col)

    reader = csv.reader(f, quoting=csv.QUOTE_NONE)
    data = []
    for line in reader:
        r = __tagProcess(tagRules, line)
        if r:
            data.extend(r)
    e = time.time()
    f.close()

    f = open(savefile, 'w')
    f.writelines(['%s-%s-%d\n' % (d['userId'], d['tag'], d['tagId']) for d in data])
    f.close()

    d = ["('%s',%d, now())" % (d['userId'], d['tagId']) for d in data]
    i = 0
    c = len(d)
    # print c
    while i < c:
        s = ','.join(d[i:i + 5000])
        letv_db.executesql("INSERT INTO tagLog (userId,tagId, date) VALUES %s" % s)
        letv_db.commit()
        i += 5000

    return 'all time: %f, data len: %d, csv line: %d' % ((e - b), len(data), reader.line_num)
def raw(self, sample=False):
    def rows():
        for line in self._sample:
            if PY2:
                yield line.encode('utf-8')
            else:
                yield line
        if not sample:
            for line in self.lines:
                if PY2:
                    yield line.encode('utf-8')
                else:
                    yield line

    # Fix the maximum field size to something a little larger
    csv.field_size_limit(256000)

    try:
        for row in csv.reader(rows(), dialect=self._dialect, **self._overrides):
            yield [Cell(to_unicode_or_bust(c)) for c in row]
    except csv.Error as err:
        if u'newline inside string' in unicode_string(err) and sample:
            pass
        elif u'line contains NULL byte' in unicode_string(err):
            pass
        else:
            raise messytables.ReadError('Error reading CSV: %r', err)
def scrubcsv(fnamein, fnameout, i):
    csv.field_size_limit(100000000)
    infile = csv.reader(open(fnamein, 'r'))
    outfile = csv.writer(open(fnameout, 'w'))
    for row in infile:
        if len(row) == i:
            outfile.writerow(row)
def reada(filename="index.csv"):
    # problem, Error: field larger than field limit (131072)
    # http://lethain.com/entry/2009/jan/22/handling-very-large-csv-and-xml-files-in-python/
    csv.field_size_limit(1000000000)
    foor = read(filename)
    gci = get_column_index
    # augment with noteid to make augmented
    return [[Note.objects.filter(owner=x[gci('owner_id')], jid=x[gci('jid')])[0].id] + x
            for x in foor if not x[gci('primary')] == '-no idea-']
def parse_uploaded(f):
    try:
        logging.info("here we go")
        csv.field_size_limit(1000000000)

        # 1. getting file encoding
        result = chardet.detect(f.read())
        encoding = result['encoding']

        # 2. determining dialect
        f.open()
        sniffer = csv.Sniffer()
        dialect = sniffer.sniff(f.read())
        dialect.delimiter = "\t"

        # 3. encoding file
        f.open()
        utf8_file = f.read().decode(encoding).encode('utf-8')
        reader = csv.DictReader(utf8_file.splitlines(), dialect=csv.excel_tab)

        # 4. get results
        results = [row for row in reader]
    except Exception, e:
        logging.info(str(e))
def IntronExtractor(bed12):
    # row[0]  chr
    # row[1]  alignment start
    # row[2]  alignment end
    # row[3]  name
    # row[4]
    # row[5]  strand
    # row[6]  alignment start
    # row[7]  alignment end
    # row[8]
    # row[9]  blocknum
    # row[10] blocksizes
    # row[11] qstarts
    for row in csv.reader(open(bed12), delimiter='\t'):
        csv.field_size_limit(1000000000)
        chr = row[0]
        start = row[1]
        end = row[2]
        strand = row[5]
        bn = int(row[9])
        name = row[4]
        if strand == "+":
            print "\t".join([chr, start, start, name, str(0), strand])
        elif strand == "-":
            print "\t".join([chr, end, end, name, str(0), strand])
def uploadFromFile():
    dir = os.path.dirname(os.path.abspath(__file__))
    filepath = os.path.join(dir, 'test.txt')
    f = open(filepath, "r")
    csv.field_size_limit(1000000000)

    # getting file encoding
    result = chardet.detect(f.read())
    encoding = result['encoding']

    # determining dialect
    f.seek(0)
    sniffer = csv.Sniffer()
    dialect = sniffer.sniff(f.read())
    dialect.delimiter = "\t"

    # encoding file
    f.seek(0)
    utf8_file = f.read().decode(encoding).encode('utf-8')
    reader = csv.DictReader(utf8_file.splitlines(), dialect=csv.excel_tab)
    rows = list(reader)

    # get results
    results = []
    for i, row in enumerate(rows):
        results.append(row)
        progress = 100 * float(i) / float(len(rows))
        current_task.update_state(state='PROGRESS',
                                  meta={'current': i, 'total': len(rows),
                                        'progress': progress})

    current_task.update_state(state='SUCCESS')
    return results
def DRcounter(file):
    reader = csv.reader(open(file), dialect='excel-tab')
    csv.field_size_limit(1000000000)

    for row in reader:
        SJ5 = row[15]
        SJ3 = row[16]
        L = len(SJ5) / 2
        SJ5U = SJ5[:L]
        SJ5D = SJ5[L:]
        SJ3U = SJ3[:L]
        SJ3D = SJ3[L:]
        DRU = 0
        DRD = 0

        if SJ5U == SJ3U:
            DRU = L
        else:
            while SJ5U[L - 1 - DRU] == SJ3U[L - 1 - DRU]:
                DRU = (DRU + 1)
                if SJ5U[L - 1 - DRU] != SJ3U[L - 1 - DRU]:
                    break

        if SJ5D == SJ3D:
            DRD = L
        else:
            while SJ5D[DRD] == SJ3D[DRD]:
                DRD = (DRD + 1)
                if SJ5D[DRD] != SJ3D[DRD]:
                    break

        print row[0], row[1], row[2], row[3], row[4], row[5], row[6], row[7], \
            row[8], row[9], row[10], row[11], SJ5U, SJ5D, SJ3U, SJ3D, DRU, DRD, DRU + DRD
def do_insert_from_csv(insert, filename):
    import csv
    csv.field_size_limit(2**31)
    with open(filename, 'r', encoding="UTF-8") as commits_file:
        for row in tqdm(csv.reader(commits_file)):
            insert(*row)
def splitByScaff(pileUpFileName, opDir):
    csv.field_size_limit(sys.maxint)
    with open(pileUpFileName, 'r') as pileUpFile:
        pileUpReader = csv.reader(pileUpFile, delimiter='\t')
        prevScaffName = ''
        currScaffFile = None
        currScaffWriter = None
        for row in pileUpReader:
            scaffName = row[Pileup_Consts.SCAFF_NAME]
            scaffNum = int(scaffName[-5:])
            if scaffNum > 818:
                continue
            if prevScaffName == scaffName:
                # continue to write in current scaff file
                currScaffWriter.writerow(row)
            else:
                # found a new scaff, close old file
                if currScaffFile is not None:
                    currScaffFile.close()
                # open and write new file
                currScaffFile = open(
                    os.path.join(opDir, scaffName + '.mpileup'), 'w')
                currScaffWriter = csv.writer(currScaffFile, delimiter='\t')
                currScaffWriter.writerow(row)
                prevScaffName = scaffName
def load_traces(file_name):
    """
    Load network traces from CSV
    :param file_name: (str) name of the file
    :return traces: (dict) network traces. E.g: activeCells, sensorValues, etc.
    """
    csv.field_size_limit(sys.maxsize)
    with open(file_name, 'rb') as fr:
        reader = csv.reader(fr)
        headers = reader.next()

        traces = dict()
        for field in headers:
            traces[field] = []

        for row in reader:
            for i in range(len(row)):
                if row[i] == '':
                    data = None
                else:
                    data = json.loads(row[i])
                traces[headers[i]].append(data)

    return traces
def csvload(fileName):
    csvfile = open(fileName, 'r')
    csv.field_size_limit(CSV_FILE_LIMIT)
    rdr = csv.reader(csvfile, dialect='excel', quotechar=str('"'))
    if not csv.Sniffer().has_header(csvfile.readline()):
        # no header row: rewind the underlying file so the first row is not lost
        # (the reader object itself has no seek method)
        csvfile.seek(0)
    return rdr, csvfile, fileName
def parse_csv(self, doc, delim=','):
    """
    Csv reader
    =====
    Function to read in a csv file

    Parameters
    -----
    doc : str
        The name of the csv file

    Returns
    -----
    lines : list of lists
        Each list corresponds to the cell values of a row
    """
    csv.field_size_limit(sys.maxsize)
    try:
        lines = []
        with open(doc, 'r', encoding='utf-8') as csvfile:
            csv_reader = csv.reader(csvfile, delimiter=delim)
            for line in csv_reader:
                lines.append(line)
    except:
        lines = []
        csvfile = open(doc, 'r', encoding='utf-8')
        csv_reader = csv.reader(line.replace('\0', '') for line in csvfile.readlines())
        for line in csv_reader:
            lines.append(line)
    return lines
def get_uniprot_entrez_id_map(self):
    logger.info("Mapping Uniprot ids to Entrez/ENSEMBL gene ids")
    import sys
    id_map = {}
    file = '/'.join((self.rawdir, self.files['id-map']['file']))
    with gzip.open(file, 'rb') as csvfile:
        csv.field_size_limit(sys.maxsize)
        filereader = csv.reader(io.TextIOWrapper(csvfile, newline=""),
                                delimiter='\t', quotechar='\"')
        for row in filereader:
            (uniprotkb_ac, uniprotkb_id, geneid, refseq, gi, pdb, go,
             uniref100, uniref90, uniref50, uniparc, pir, ncbitaxon, mim,
             unigene, pubmed, embl, embl_cds, ensembl, ensembl_trs,
             ensembl_pro, other_pubmed) = row
            if int(ncbitaxon) not in self.tax_ids:
                continue
            if geneid.strip() != '':
                idlist = re.split(r';', geneid)
                id_map[uniprotkb_ac.strip()] = [
                    'NCBIGene:' + i.strip() for i in idlist]
            elif ensembl.strip() != '':
                idlist = re.split(r';', ensembl)
                id_map[uniprotkb_ac.strip()] = [
                    'ENSEMBL:' + i.strip() for i in idlist]

    logger.info("Acquired %d uniprot-entrez mappings", len(id_map))
    return id_map
def fake_import(request):
    dir = os.path.dirname(os.path.abspath(__file__))
    filepath = os.path.join(dir, 'test.txt')
    f = open(filepath, "r")
    csv.field_size_limit(1000000000)

    # getting file encoding
    result = chardet.detect(f.read())
    encoding = result['encoding']

    # determining dialect
    f.seek(0)
    sniffer = csv.Sniffer()
    dialect = sniffer.sniff(f.read())
    dialect.delimiter = "\t"

    # encoding file
    f.seek(0)
    utf8_file = f.read().decode(encoding).encode('utf-8')
    reader = csv.DictReader(utf8_file.splitlines(), dialect=csv.excel_tab)
    rows = list(reader)

    # get results
    results = []
    for i, row in enumerate(rows):
        results.append(row)

    response = create_objects('items', results)
    return HttpResponse(json.dumps(response, default=bson.json_util.default),
                        mimetype="application/json")
def __init__(self, logfile, overwrite):
    if overwrite:
        self.file_object = open(logfile, 'w', 1)
    else:
        self.file_object = open(logfile, 'a', 1)
    csv.field_size_limit(sys.maxsize)
    self.log_file = csv.writer(self.file_object, delimiter=',',
                               quotechar='|', escapechar='\\',
                               quoting=csv.QUOTE_MINIMAL)
import csv, json, datetime as dt, time
from pprint import pprint
from tensorflow.keras.preprocessing.text import text_to_word_sequence as ttws

csv.field_size_limit(500000)

# https://github.com/first20hours/google-10000-english/blob/master/20k.txt
# dataSource = "source/requests_data.csv"
dataSource = "../data/LastMonthRequests.csv"
commonWordslist = "../data/90.txt"

incidentNumberIndex = 0  # 1
summaryIndex = 1  # 4
tier1Index = 2  # 9
tier2Index = 3  # 10
tier3Index = 4  # 11

paginationLimit = 100000
recordsLimit = [300000]
wordsCounter = 0

requestDict = {}
accessDict = {}
fileDict = {}
lotusDict = {}
outlookDict = {}
emailDict = {}
awsDict = {}
azureDict = {}
sqlDict = {}


def filterText(record, field):
    cleanWords = set(ttws(record[field]))
    for word in cleanWords:
from __future__ import print_function
from sys import stdout, maxsize
import csv

maxInt = 2147483647
decrement = True

while decrement:
    # decrease the maxInt value by factor 10
    # as long as the OverflowError occurs.
    # http://stackoverflow.com/questions/15063936/csv-error-field-larger-than-field-limit-131072
    decrement = False
    try:
        csv.field_size_limit(maxInt)
    except OverflowError:
        maxInt = int(maxInt / 10)
        decrement = True

verb = False

"""
Doc :
    A class to read the SWATH scoring logs from Peakview and OpenSWATH

    The data is returned from the "parse_files" functions as a list of runs.
    Each run contains a number of precursors and each precursor contains
    peakgroups (e.g. peakgroups that were found in chromatographic space).

    usage:
    reader = SWATHScoringReader.newReader(options.infiles, options.file_format)
import os
import sys
import numpy as np
from collections import defaultdict
import cPickle as pickle
import pandas as pd
import csv
from scipy import spatial
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re
import string
import codecs
import argparse

csv.field_size_limit(sys.maxsize)

reload(sys)
sys.setdefaultencoding('utf-8')

parser = argparse.ArgumentParser()
parser.add_argument('--bursty_issues_dir',
                    help='Folder containing burst wise issues',
                    default='Sample_Data/burst_issues/')
parser.add_argument('--raw_commit_dir',
                    help='Directory containing raw commit files per project',
                    default='Sample_Data/raw_commits/')
parser.add_argument(
    '--ins_del_count_dir',
    help='Directory containing insertion/deletion counts per commit per project',
    default='Sample_Data/ModRequest/insertion_deletion_counts/')
def combine_cluster_labels(config: configparser.SectionProxy): # If you have a partially-processed dataset disable the parts you have # already done write_joined_cluster_labels = True write_cluster_epsilon_representative_docid = True logger.info("Reading intermediate file") hierarchyMaxEpsilon: int = int(get_common_config()['HierarchyMaxEpsilon']) csv.field_size_limit(2**23) fin = open(config['IntermediateColumnFilteredFileName'], 'r', encoding='utf-8') fin.readline() fout = open(config['OutputDocumentMetadataFileName'], 'w', encoding='utf-8', newline='') logger.info("Writing document metadata file") csvreader = csv.reader(fin, delimiter=',') csvwriter = csv.writer(fout, delimiter=',') csvwriter.writerow(['documentId, firstScrapedDate, title']) hash_doc = {} for line in csvreader: (documentId, firstScrapedDate, title, domain, text) = line hash_doc[documentId] = title csvwriter.writerow([documentId, firstScrapedDate, title]) fout.close() fin.close() logger.info("Finished writing document metadata") fin = open(get_epsilon_cluster_filename(config, hierarchyMaxEpsilon), 'r') hash_clusterhit = {} for line2 in fin: clusterid = int(line2.split(',')[0]) docid = line2.split(',')[1].strip() hash_clusterhit[docid] = str(clusterid) fin.close() if write_joined_cluster_labels: logger.info("Writing joined cluster labels") fout = open(config['OutputJoinedClusterLabelsFileName'], 'w') fout.write('clusterid,docid,epsilon\n') hash_clustereps: Dict[Tuple[str, int], List[str]] = {} hash_clusterstats: Dict[Tuple[str, int], int] = {} for x in range(0, int(hierarchyMaxEpsilon) + 1): fin = open(get_epsilon_cluster_filename(config, x), 'r') for line3 in fin: # Get the epsilon and the clusterid clusterid = int(line3.split(',')[0]) ky = (str(x), clusterid) # Track the size of the cluster if ky not in hash_clusterstats: hash_clusterstats[ky] = 0 # for computing size later hash_clusterstats[ky] += 1 docid = line3.split(',')[1].strip() if ky not in hash_clustereps: hash_clustereps[ky] = [] hash_clustereps[ky].append(docid) fout.write(line3.strip() + ',' + str(x) + '\n') fin.close() fout.close() if write_cluster_epsilon_representative_docid: logger.info("Writing cluster epsilon representative docids") fout = open(config['IntermediateClusterEpsilonRepresentativeDocId'], 'w', encoding='utf-8') _ = fout.write( 'epsilon,clusterid,representativedocid,ancestorclusterid,size,titlesummary\n' ) for ky2 in hash_clustereps: repdoc: Optional[str] = None maxtitlehits = 0 hash_titles = {} bow_arry = [] # Skip the -1 clusters. These are nodes/documents that do not fit in a cluster if ky2[1] == -1: continue # iterate through for each cluster/eps pairing and get the most frequently occuring title for doc in hash_clustereps[ky2]: title = hash_doc[doc] bow_arry.append(title) if repdoc is None: repdoc = doc maxtitlehits = 1 hash_titles[title] = (doc, 1) continue else: if title in hash_titles: pair = hash_titles[title] hits = 1 + pair[1] if hits > maxtitlehits: maxtitlehits = hits repdoc = pair[0] # save the incremented value hash_titles[title] = (pair[0], hits) else: hash_titles[title] = (doc, 1) count_vec = sklearn.feature_extraction.text.CountVectorizer( 'content', stop_words='english') try: model = count_vec.fit(bow_arry) invmodel = {v: k for k, v in model.vocabulary_.items()} fit_matrix = count_vec.transform(bow_arry) # can make this a lot faster if we just sum by column! 
# Summation by column topterms = [] columnsums = fit_matrix.sum(axis=0) # Get the top items arryidx = numpy.array(columnsums)[0] # Sort the array by index - keep the top 10 terms sortedarry = numpy.flip(numpy.argsort(arryidx), axis=0)[0:10] # Then pull out the vocab terms for vocab_key in sortedarry: topterms.append(invmodel[vocab_key]) # Join them together for a summary termconcat2 = '\t'.join(topterms) except Exception: termconcat2 = '' _ = fout.write( f'{str(ky2[0])},{str(ky2[1])},{repdoc},{str(hash_clusterhit[str(repdoc)])},' f'{str(hash_clusterstats[ky2])},{termconcat2}\n') fout.close() logger.info("Finished combine_cluster_labels")
from Orange.data import (
    _io, is_discrete_values, MISSING_VALUES, Table, Domain, Variable,
    DiscreteVariable, StringVariable, ContinuousVariable, TimeVariable,
)
from Orange.util import Registry, flatten, namegen

# Support values longer than 128K (i.e. text contents features)
csv.field_size_limit(100 * 1024 * 1024)

__all__ = ["Flags", "FileFormat"]

_IDENTITY = lambda i: i


class Compression:
    """Supported compression extensions"""
    GZIP = ".gz"
    BZIP2 = ".bz2"
    XZ = ".xz"
    all = (GZIP, BZIP2, XZ)
#!/usr/bin/env python
import sys
import csv
import errno

csv.field_size_limit(sys.maxsize)  # Or else it cannot handle fields longer than 131072

tabin = csv.reader(sys.stdin, dialect=csv.excel_tab)
commaout = csv.writer(sys.stdout, dialect=csv.excel, lineterminator='\n')

try:
    for row in tabin:
        commaout.writerow(row)
except IOError as e:
    if e.errno == errno.EPIPE:
        pass
    else:
        raise
import sys
import csv

csv.field_size_limit(sys.maxint)

from itertools import repeat


def select_data(data_num, percentage):
    new_data_num = []
    for i in data_num:
        new_data_num.append(int(i * percentage))
    return new_data_num


def data_select(chunk, fold, user_select, file):
    # fold = can generate percentage of the data for test and for training
    user_log_num = []
    count = 1
    cur_user = 0
    index_count = 0
    index = 0
    with open(file, 'rb') as tsvin:
        Input = csv.reader(tsvin, delimiter='\t')
        for row in Input:
            if (cur_user != row[0]):
                cur_user = row[0]
                user_log_num.append(count)
                count = 1
            else:
                count = count + 1
        user_log_num.append(count)
def main(): parser = argparse.ArgumentParser( description = 'VSM - Feature Extraction') help_msgs = [] # Input arguments # help_msgs.append('corpus_pp path') help_msgs.append('predefined vectors path') help_msgs.append('word embeddings path') # Output arguments # help_msgs.append('features vsm path') default_paths = [join(getcwd(), 'vsm')] # Input arguments # parser.add_argument('corpus_pp_path', help=help_msgs[0]) parser.add_argument('predefined_vectors_path', help=help_msgs[1]) parser.add_argument('word_embeddings_path', help=help_msgs[2]) # Output arguments # parser.add_argument('--output', action='store', metavar='PATH', default=default_paths[0], help=help_msgs[3], dest='output_path') # Arguments parsing # args = parser.parse_args() # Check if input exists and is a directory. Otherwise, exit # No extra indentation. if not isdir(args.corpus_pp_path): sys.exit('The input path does not point to a valid directory') # Create the 'Output' directory if it does not exist # if not isdir(args.output_path): makedirs(args.output_path) corpus = [] users = [] register_dialect('tab', delimiter='\t') filepath = join(args.corpus_pp_path, 'corpus_pp.tsv') field_size_limit(sys.maxsize) with open(filepath, 'rt') as fp: r = reader(fp, dialect='tab') # Discard Header # next(r) # * username # * content_pp for row in r: username = row[0] content_pp = row[1] corpus.append(content_pp) users.append(username) vectorizer = CountVectorizer() # In case you are interested in trying other types other weighting schemes # vectorizer = TfidfVectorizer() term_doc_mtx = vectorizer.fit_transform(corpus).toarray() idx2term = vectorizer.get_feature_names() print(' -- Term-Doc Matrix Created --') # Load Personality (big-5) Vectors # filepath = join(args.predefined_vectors_path, 'pb5.vec') pb5_vec = load_predefined_vectors(filepath) # Load Personality Disorders Vectors # filepath = join(args.predefined_vectors_path, 'pd.vec') pd_vec = load_predefined_vectors(filepath) # Load word embeddings # word_embb = KeyedVectors.load_word2vec_format(args.word_embeddings_path, binary=True) print(' -- Word Embeddings loaded --') # For each row of the document-term matrix: # # * For each term t_i in document d_j # # * Compute sim(t_i, v_k) where v is a word in pb5_vector/pd_vector # # * Compute the average of the similarities obtained # x = vsm(idx2term, term_doc_mtx, pb5_vec, pd_vec, word_embb) # Replace nan with zero # np.nan_to_num(x, copy=False) filepath = join(args.output_path, 'personality_scores') np.save(filepath, x) filepath = join(args.output_path, 'users.txt') with open(filepath, 'w') as fp: for username in users: fp.write('%s\n' % username) return 0
def main(): args = parse_args() csv.field_size_limit(sys.maxsize) print("A) Load data") transform = transforms.Compose([ transforms.Resize(255), transforms.CenterCrop(224), transforms.ToTensor() ]) if args.tiny: train_data = CocoCaptions(args.train_img_path, args.train_ann_file, transform=transform, split="tiny") dev_data = CocoCaptions(args.train_img_path, args.train_ann_file, transform=transform, split="tiny") elif args.restval: train_data = CocoCaptions((args.train_img_path, args.dev_img_path), (args.train_ann_file, args.dev_ann_file), transform=transform, split="restval") dev_data = CocoCaptions(args.dev_img_path, args.dev_ann_file, transform=transform, split="dev") else: train_data = CocoCaptions(args.train_img_path, args.train_ann_file, transform=transform, split="train") dev_data = CocoCaptions(args.dev_img_path, args.dev_ann_file, transform=transform, split="dev") print("B) Load model") if args.model == "vse++": raise NotImplementedError elif args.model == "univse": if args.simple: model = simp_univse.UniVSE.from_filename(args.vocab_file, train_cnn=args.train_cnn) else: model = univse.UniVSE.from_filename(args.vocab_file, train_cnn=args.train_cnn) model.vocabulary_encoder.add_graphs(args.graph_file) # Randomize modifier model.vocabulary_encoder.modif = torch.nn.Embedding( len(model.vocabulary_encoder.corpus), 100) model.vocabulary_encoder.modif.weight.data.uniform_(-0.1, 0.1) model.vocabulary_encoder.modif.weight.data[ model.vocabulary_encoder.train_corpus_length:] = torch.zeros( (len(model.vocabulary_encoder.corpus) - model.vocabulary_encoder.train_corpus_length, 100)) else: print("ERROR: model name unknown." ) # You shouldn't be able to reach here! return device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu') model = model.to(device) # Observe that all parameters are being optimized optimizer = optim.Adam(model.params, lr=args.lr) lr_scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=[args.lr_update], gamma=0.1) print("C) Train model") train_params = { 'batch_size': args.batch_size, 'shuffle': True, 'num_workers': 6 } train_gen = data.DataLoader(train_data, **train_params) dev_params = { 'batch_size': args.batch_size, 'shuffle': False, 'num_workers': 6 } dev_gen = data.DataLoader(dev_data, **dev_params) train_losses = [] dev_losses = [] ir_r1_1k, ir_r5_1k, ir_r10_1k = [], [], [] tr_r1_1k, tr_r5_1k, tr_r10_1k = [], [], [] ir_r1_5k, ir_r5_5k, ir_r10_5k = [], [], [] tr_r1_5k, tr_r5_5k, tr_r10_5k = [], [], [] best_model_wts = copy.deepcopy(model.state_dict()) best_modif_emb = copy.deepcopy(model.vocabulary_encoder.modif) best_rsum = 0 t_epoch = tqdm(range(1, args.epochs + 1), desc="Epoch") for epoch in t_epoch: if epoch > 2: model.criterion.n_r = 1.0 # Each epoch has a training and validation phase for phase in ['train', 'dev']: if phase == 'train': generator = train_gen model.train_start() # Set model to training mode else: generator = dev_gen model.val_start() # Set model to evaluate mode running_loss = 0.0 idx = 0 img_embeddings = np.zeros((len(dev_data), args.hidden_size)) cap_embeddings = np.zeros((len(dev_data), args.hidden_size)) count = 0 # Iterate over data. 
t_batch = tqdm(generator, desc="Batch", leave=False) for img, sent in t_batch: sentences = list(sent) embeddings = model(img, sentences) time_start = time.time() total_loss, other_loss = model.criterion(embeddings) if not args.simple: model.times["loss"] += time.time() - time_start # ####### DEBUG ######## # if not args.simple and epoch == 1 and idx == 100: with open("times.txt", "w") as t_file: t_file.write(f" # EPOCH {epoch}\t# BATCH {idx} #\n") t_file.write( f"Image: {model.times['image'] * 1000 / model.times['n']} ms\n" ) t_file.write( f"Input: {model.times['input'] * 1000 / model.times['n']} ms\n" ) t_file.write( f"Vocab: {model.times['vocab'] * 1000 / model.times['n']} ms\n" ) t_file.write( f"Object: {model.times['object'] * 1000 / model.times['n']} ms\n" ) t_file.write( f"Neural: {model.times['neural'] * 1000 / model.times['n']} ms\n" ) t_file.write( f"Compos: {model.times['comp'] * 1000 / model.times['n']} ms\n" ) t_file.write( f"Unflat: {model.times['unflatten'] * 1000 / model.times['n']} ms\n" ) t_file.write( f"Loss: {model.times['loss'] * 1000 / model.times['n']} ms\n" ) t_file.write(f"\n") if phase == "dev": aux_count = count + embeddings["sent_emb"].size(0) img_embeddings[count:aux_count] = embeddings[ "img_emb"].data.cpu().numpy().copy() cap_embeddings[count:aux_count] = embeddings[ "sent_emb"].data.cpu().numpy().copy() count = aux_count if phase == "train": optimizer.zero_grad() total_loss.backward() if model.grad_clip > 0: clip_grad_norm_(model.params, model.grad_clip) optimizer.step() lr_scheduler.step(epoch - 1) total_loss = float(total_loss.data.cpu().numpy()) t_batch.set_description(f"Batch Loss: {total_loss:.6f}") running_loss += total_loss idx += 1 running_loss /= idx if phase == "train": train_losses.append(running_loss) else: dev_losses.append(running_loss) # Compute R@k values for 1K Validation rt = itr.i2t(img_embeddings[:5000], cap_embeddings[:5000], measure='cosine', return_ranks=False) ri = itr.t2i(img_embeddings[:5000], cap_embeddings[:5000], measure='cosine', return_ranks=False) current_rsum_1k = ri[0] + ri[1] + ri[2] + rt[0] + rt[1] + rt[2] ir_r1_1k.extend([ri[0]]) ir_r5_1k.extend([ri[1]]) ir_r10_1k.extend([ri[2]]) tr_r1_1k.extend([rt[0]]) tr_r5_1k.extend([rt[1]]) tr_r10_1k.extend([rt[2]]) # Compute R@k values for 5K Validation rt = itr.i2t(img_embeddings, cap_embeddings, measure='cosine', return_ranks=False) ri = itr.t2i(img_embeddings, cap_embeddings, measure='cosine', return_ranks=False) current_rsum = ri[0] + ri[1] + ri[2] + rt[0] + rt[1] + rt[2] t_epoch.set_description( f"Epoch RSum: {current_rsum_1k:.1f} (1K) / {current_rsum:.1f} (5K)" ) ir_r1_5k.extend([ri[0]]) ir_r5_5k.extend([ri[1]]) ir_r10_5k.extend([ri[2]]) tr_r1_5k.extend([rt[0]]) tr_r5_5k.extend([rt[1]]) tr_r10_5k.extend([rt[2]]) # Deep copy the model if it's the best rsum if current_rsum > best_rsum: del best_modif_emb, best_model_wts best_rsum = current_rsum best_modif_emb = copy.deepcopy( model.vocabulary_encoder.modif) best_model_wts = copy.deepcopy(model.state_dict()) # Plot recall@k values if args.plot and epoch > 1: fig = plotter.plot_recall_curve( range(1, epoch + 1), ir_r1_1k, ir_r5_1k, ir_r10_1k, title="Image Retrieval (1K)") plt.savefig( os.path.join( args.output_path, f"training_recalls_{args.model}_ir_1k.png")) plt.close(fig) fig = plotter.plot_recall_curve( range(1, epoch + 1), tr_r1_1k, tr_r5_1k, tr_r10_1k, title="Text Retrieval (1K)") plt.savefig( os.path.join( args.output_path, f"training_recalls_{args.model}_tr_1k.png")) plt.close(fig) fig = plotter.plot_recall_curve( range(1, 
epoch + 1), ir_r1_5k, ir_r5_5k, ir_r10_5k, title="Image Retrieval (5K)") plt.savefig( os.path.join( args.output_path, f"training_recalls_{args.model}_ir_5k.png")) plt.close(fig) fig = plotter.plot_recall_curve( range(1, epoch + 1), tr_r1_5k, tr_r5_5k, tr_r10_5k, title="Text Retrieval (5K)") plt.savefig( os.path.join( args.output_path, f"training_recalls_{args.model}_tr_5k.png")) plt.close(fig) # Save intermediate loss and recall plots after the second epoch if args.plot and phase == "dev" and epoch > 1: fig = plotter.plot_loss_curve(range(1, epoch + 1), train_losses, dev_losses, yexp=True) plt.savefig( os.path.join(args.output_path, f"training_losses_{args.model}.png")) plt.close(fig) model.load_state_dict(best_model_wts) model.save_model(os.path.join(args.output_path, f"best_{args.model}.pth")) model.vocabulary_encoder.modif = best_modif_emb model.vocabulary_encoder.save_corpus( os.path.join(args.output_path, f"best_corpus_{args.model}.pickle")) with open(os.path.join(args.output_path, "losses.pickle"), "wb") as f: losses = {"train": train_losses, "dev": dev_losses} pickle.dump(losses, f) with open(os.path.join(args.output_path, "recalls_at_k.pickle"), "wb") as f: recalls_at_k = { "ir_r1_1k": ir_r1_1k, "ir_r5_1k": ir_r5_1k, "ir_r10_1k": ir_r10_1k, "tr_r1_1k": tr_r1_1k, "tr_r5_1k": tr_r5_1k, "tr_r10_1k": tr_r10_1k, "ir_r1_5k": ir_r1_5k, "ir_r5_5k": ir_r5_5k, "ir_r10_5k": ir_r10_5k, "tr_r1_5k": tr_r1_5k, "tr_r5_5k": tr_r5_5k, "tr_r10_5k": tr_r10_5k } pickle.dump(recalls_at_k, f)
sys.path.insert(0, '/edx/app/hadoop/pipeline/local/lib/python2.7/site-packages')

from edx.analytics.tasks.common.mapreduce import MapReduceJobTask, MapReduceJobTaskMixin
from edx.analytics.tasks.common.mysql_load import MysqlInsertTask
from edx.analytics.tasks.common.sqoop import SqoopImportFromMysql
from edx.analytics.tasks.export.database_exports import FIELD_SIZE_LIMIT, StudentModuleRecord
from edx.analytics.tasks.util import csv_util
from edx.analytics.tasks.util.url import get_target_from_url, url_path_join

log = logging.getLogger(__name__)

# Increase maximum number of characters per field since we have
# entries that easily exceed the default value of 128 KB.
csv.field_size_limit(FIELD_SIZE_LIMIT)


######################################################################
#       Abstract Import and Histogram Calculation Section            #
######################################################################


class HistogramTaskFromSqoopParamsMixin(object):
    """
    Mixin the parameters for HistogramsFromStudentModule that involve Sqoop
    """
    name = luigi.Parameter(description='Name of this run', )
    dest = luigi.Parameter(
        description='URL of S3 location/directory where the task outputs',
    )
    credentials = luigi.Parameter(
        config_path={
def setUp(self):
    self.lim = csv.field_size_limit()
    with open('.test.csv', 'w') as f:
        f.write('a' * 10)
def __init__(self, streamID, write=False, fields=None, missingValues=None, bookmark=None, includeMS=True, firstRecord=None): """ streamID: CSV file name, input or output write: True or False, open for writing if True fields: a list of nupic.data.fieldmeta.FieldMetaInfo field descriptors, only applicable when write==True missingValues: what missing values should be replaced with? bookmark: a reference to the previous reader, if passed in, the records will be returned starting from the point where bookmark was requested. Either bookmark or firstRecord can be specified, not both. If bookmark is used, then firstRecord MUST be None. includeMS: If false, the microseconds portion is not included in the generated output file timestamp fields. This makes it compatible with reading in from Excel. firstRecord: 0-based index of the first record to start reading from. Either bookmark or firstRecord can be specified, not both. If bookmark is used, then firstRecord MUST be None. Each field is a 3-tuple (name, type, special or FieldMetaSpecial.none) The name is the name of the field. The type is one of the constants in `FieldMetaType`. The special is one of the `FieldMetaSpecial` values that designate their field as the sequenceId, reset, timestamp, or category. With exception of multiple categories, there can be at most one of each. There may be multiple fields of type datetime, but no more than one of them may be the timestamp field (FieldMetaSpecial.timestamp). The sequence id field must be either a string or an int. The reset field must be an int (and must contain 0 or 1). The category field must be an int or space-separated list of ints, where the former represents single-label classification and the latter is for multi-label classification (e.g. "1 3 4" designates a record for labels 1, 3, and 4). The number of categories is allowed to vary record to record; sensor regions represent non-categories with -1, thus the category values must be >= 0. The FileRecordStream iterates over the field names, types and specials and stores the information. 
""" super(FileRecordStream, self).__init__() # Only bookmark or firstRow can be specified, not both if bookmark is not None and firstRecord is not None: raise RuntimeError( "Only bookmark or firstRecord can be specified, not both") if fields is None: fields = [] if missingValues is None: missingValues = [''] # We'll be operating on csvs with arbitrarily long fields size = 2**27 csv.field_size_limit(size) self._filename = streamID # We can't guarantee what system files are coming from, use universal # newlines self._write = write self._mode = self._FILE_WRITE_MODE if write else self._FILE_READ_MODE self._file = open(self._filename, self._mode) self._sequences = set() self.rewindAtEOF = False if write: assert fields is not None assert isinstance(fields, (tuple, list)) # Verify all fields are 3-tuple assert all( isinstance(f, (tuple, FieldMetaInfo)) and len(f) == 3 for f in fields) names, types, specials = zip(*fields) self._writer = csv.writer(self._file) else: # Make sure readline() works on windows too os.linesep = '\n' # Read header lines self._reader = csv.reader(self._file, dialect="excel") try: names = [n.strip() for n in self._reader.next()] except: raise Exception('The header line of the file %s contained a NULL byte' \ % self._filename) types = [t.strip() for t in self._reader.next()] specials = [s.strip() for s in self._reader.next()] # If there are no specials, this means there was a blank line if len(specials) == 0: specials = [""] if not len(names) == len(types) == len(specials): raise Exception('Invalid file format: different number of fields ' 'in the header rows of file %s (%d, %d, %d)' % (streamID, len(names), len(types), len(specials))) # Verify standard file format for t in types: if not FieldMetaType.isValid(t): raise Exception( 'Invalid file format for "%s" - field type "%s" ' 'not a valid FieldMetaType' % ( self._filename, t, )) for s in specials: if not FieldMetaSpecial.isValid(s): raise Exception( 'Invalid file format. 
\'%s\' is not a valid special ' 'flag' % s) self._fields = [ FieldMetaInfo(*attrs) for attrs in zip(names, types, specials) ] self._fieldCount = len(self._fields) # Keep track on how many records have been read/written self._recordCount = 0 self._timeStampIdx = (specials.index(FieldMetaSpecial.timestamp) if FieldMetaSpecial.timestamp in specials else None) self._resetIdx = (specials.index(FieldMetaSpecial.reset) if FieldMetaSpecial.reset in specials else None) self._sequenceIdIdx = (specials.index(FieldMetaSpecial.sequence) if FieldMetaSpecial.sequence in specials else None) self._categoryIdx = (specials.index(FieldMetaSpecial.category) if FieldMetaSpecial.category in specials else None) self._learningIdx = (specials.index(FieldMetaSpecial.learning) if FieldMetaSpecial.learning in specials else None) # keep track of the current sequence self._currSequence = None self._currTime = None if self._timeStampIdx: assert types[self._timeStampIdx] == FieldMetaType.datetime if self._sequenceIdIdx: assert types[self._sequenceIdIdx] in (FieldMetaType.string, FieldMetaType.integer) if self._resetIdx: assert types[self._resetIdx] == FieldMetaType.integer if self._categoryIdx: assert types[self._categoryIdx] in (FieldMetaType.list, FieldMetaType.integer) if self._learningIdx: assert types[self._learningIdx] == FieldMetaType.integer # Convert the types to the actual types in order to convert the strings if self._mode == self._FILE_READ_MODE: m = { FieldMetaType.integer: intOrNone, FieldMetaType.float: floatOrNone, FieldMetaType.boolean: parseBool, FieldMetaType.string: unescape, FieldMetaType.datetime: parseTimestamp, FieldMetaType.sdr: parseSdr, FieldMetaType.list: parseStringList } else: if includeMS: datetimeFunc = serializeTimestamp else: datetimeFunc = serializeTimestampNoMS m = { FieldMetaType.integer: str, FieldMetaType.float: str, FieldMetaType.string: escape, FieldMetaType.boolean: str, FieldMetaType.datetime: datetimeFunc, FieldMetaType.sdr: serializeSdr, FieldMetaType.list: stripList } self._adapters = [m[t] for t in types] self._missingValues = missingValues # # If the bookmark is set, we need to skip over first N records # if bookmark is not None: rowsToSkip = self._getStartRow(bookmark) elif firstRecord is not None: rowsToSkip = firstRecord else: rowsToSkip = 0 while rowsToSkip > 0: self.next() rowsToSkip -= 1 # Dictionary to store record statistics (min and max of scalars for now) self._stats = None
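# --- Illustrative sketch (not part of the original sources) -----------------
# The FileRecordStream docstring above describes fields as 3-tuples of
# (name, type, special).  A minimal, hypothetical construction for writing
# might look like the following.  The file name and field names are invented,
# the import path follows the docstring ("nupic.data.fieldmeta") and may
# differ between nupic versions, and record-level read/write calls are
# omitted because they are not shown in this excerpt.
from nupic.data.fieldmeta import FieldMetaInfo, FieldMetaSpecial, FieldMetaType

example_fields = [
    FieldMetaInfo('timestamp', FieldMetaType.datetime, FieldMetaSpecial.timestamp),
    FieldMetaInfo('sequenceId', FieldMetaType.string, FieldMetaSpecial.sequence),
    FieldMetaInfo('reset', FieldMetaType.integer, FieldMetaSpecial.reset),
    FieldMetaInfo('consumption', FieldMetaType.float, FieldMetaSpecial.none),
]

# Opening for writing requires the field descriptors; opening for reading
# takes them from the three header rows parsed in __init__ above.
output_stream = FileRecordStream('example_output.csv', write=True,
                                 fields=example_fields)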
# set of utilities to interact with files
# @author: rm3086 (at) columbia (dot) edu

import csv, shutil, os, sys, glob
import _pickle as cPickle
import struct

platform_c_maxint = 2**(struct.Struct('i').size * 8 - 1) - 1
csv.field_size_limit(platform_c_maxint)

from .log import strd_logger

# logger
global log
log = strd_logger('file')


# check if a file exists
def file_exist(fname):
    try:
        open(fname, 'r')
        return True
    except IOError:
        return False


# create directory if not existing
def mkdir(dirname):
    try:
        os.makedirs(dirname)
    except OSError:
        pass
def tearDown(self):
    # Resetting limit to avoid failure in other tests.
    csv.field_size_limit(self.lim)
    os.remove('.test.csv')
def csvInit():
    csv.field_size_limit(1000000000)
    return 0
import optparse
import fileinput
import collections
import datetime
import time
import csv
import types
import codecs

"""
Essentially reverses the process of bundle-items.  Processes the CSV download
from MTurk and bursts out multiple items in each HIT.  Each field name that
ends in "_1", "_2" etc is assumed to be such a multiplexed field.  Any other
fields will be repeated in the output.  Can produce JSON format rather than
CSV if desired.
"""

csv.field_size_limit(10**6)

######################################################################


def maybeOpen(file, mode="r", encoding="utf8"):
    if type(file) is types.StringType:
        file = open(file, mode)
    if encoding:
        file = (mode == "r" and codecs.getreader or codecs.getwriter)(encoding)(file)
    return file

######################################################################
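# --- Illustrative sketch (not part of the original script) ------------------
# The module docstring explains that field names ending in "_1", "_2", ...
# are multiplexed per-item fields while everything else is shared across
# items.  A rough, hypothetical demultiplexer for one CSV row could look like
# this (field names such as "answer_1" are invented for the example):
import re


def burst_row(row):
    shared = dict((k, v) for k, v in row.items() if not re.search(r'_\d+$', k))
    items = {}
    for key, value in row.items():
        match = re.match(r'^(.*)_(\d+)$', key)
        if match:
            name, idx = match.group(1), int(match.group(2))
            items.setdefault(idx, dict(shared))[name] = value
    # e.g. {'hit_id': 'H1', 'answer_1': 'yes', 'answer_2': 'no'} ->
    #      [{'hit_id': 'H1', 'answer': 'yes'}, {'hit_id': 'H1', 'answer': 'no'}]
    return [items[i] for i in sorted(items)]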
def backup(self): """ Backup the database to a local SQLite database @ToDo: Option to use a temporary DB in Postgres/MySQL as this takes too long for a large DB """ moves = self.moves news = self.news strints = self.strints strbools = self.strbools if not moves and not news and not strbools and not strints: # Nothing to backup return import os db = self.db folder = "%s/databases/backup" % current.request.folder # Create clean folder for the backup if os.path.exists(folder): import shutil shutil.rmtree(folder) import time time.sleep(1) os.mkdir(folder) # Setup backup database db_bak = DAL("sqlite://backup.db", folder=folder, adapter_args={"foreign_keys": False}) # Copy Table structure skip = [] for tablename in db.tables: if tablename == "gis_location": table = db[tablename] fields = [ table[field] for field in table.fields if field != "the_geom" ] try: db_bak.define_table(tablename, *fields) except KeyError: # Can't resolve reference yet # Cleanup del db_bak[tablename] # Try later skip.append(tablename) else: try: db_bak.define_table(tablename, db[tablename]) except KeyError: # Can't resolve reference yet # Cleanup del db_bak[tablename] # Try later skip.append(tablename) while skip: _skip = [] for tablename in skip: if tablename == "gis_location": table = db[tablename] fields = [ table[field] for field in table.fields if field != "the_geom" ] try: db_bak.define_table(tablename, *fields) except KeyError: # Can't resolve reference yet # Cleanup del db_bak[tablename] # Try later _skip.append(tablename) except: import sys print "Skipping %s: %s" % (tablename, sys.exc_info()[1]) else: try: db_bak.define_table(tablename, db[tablename]) except KeyError: # Can't resolve reference yet # Cleanup del db_bak[tablename] # Try later _skip.append(tablename) except: import sys print "Skipping %s: %s" % (tablename, sys.exc_info()[1]) skip = _skip # Which tables do we need to backup? tables = [] if moves: for tablename in moves: tables.append(tablename) if news: for tablename in news: new = news[tablename] for t in new["tables"]: tables.append(t) for s in new["supers"]: tables.append(s) stable = db[s] rows = db(stable._id > 0).select(stable.instance_type) instance_types = set([r.instance_type for r in rows]) for t in instance_types: tables.append(t) if strbools: for tablename, fieldname in strints: tables.append(tablename) if strints: for tablename, fieldname in strints: tables.append(tablename) # Remove duplicates tables = set(tables) # Copy Data import csv csv.field_size_limit(2**20 * 100) # 100 megs for tablename in tables: filename = "%s/%s.csv" % (folder, tablename) file = open(filename, "w") rows = db(db[tablename].id > 0).select() rows.export_to_csv_file(file) file.close() file = open(filename, "r") db_bak[tablename].import_from_csv_file( file, unique="uuid2") # uuid2 designed to not hit! file.close() db_bak.commit() # Pass handle back to other functions self.db_bak = db_bak
    print('lda took: ', end - start, ' seconds')
    return lda, get_doc_topic(corpus, lda, id_to_bow), dictionary


def viz(lda, corpus, dictionary, html_fn):
    lda_visualization = pyLDAvis.gensim.prepare(lda, corpus, dictionary,
                                                sort_topics=False)
    pyLDAvis.save_html(lda_visualization, html_fn)


# main <----------------
k = int(sys.argv[2])
fn = sys.argv[1]
csv.field_size_limit(524288)

print("reading in text data...")
id_and_text = []
id_to_source = dict()
with open(fn, newline='') as data:
    reader = csv.reader(data)
    for line in reader:
        idn = line[0]
        text = line[1]
        id_to_source[idn] = line[2]
        text = text.replace('‘', '\'').replace('’', '\'').replace('“',
class BasicProcessor(BasicWorker, metaclass=abc.ABCMeta): """ Abstract post-processor class A post-processor takes a finished search query as input and processed its result in some way, with another result set as output. The input thus is a CSV file, and the output (usually) as well. In other words, the result of a post-processor run can be used as input for another post-processor (though whether and when this is useful is another question). """ db = None # database handler dataset = None # Dataset object representing the dataset to be created job = None # Job object that requests the execution of this processor parent = None # Dataset object to be processed, if applicable source_file = None # path to dataset to be processed, if applicable description = "No description available" # processor description, shown in web front-end category = "Other" # processor category, for sorting in web front-end extension = "csv" # extension of files created by this processor options = {} # configurable options for this processor parameters = {} # values for the processor's configurable options # Tumblr posts can overflow the regular limit, so double this. csv.field_size_limit(131072 * 2) def work(self): """ Process a dataset Loads dataset metadata, sets up the scaffolding for performing some kind of processing on that dataset, and then processes it. Afterwards, clean up. """ try: self.dataset = DataSet(key=self.job.data["remote_id"], db=self.db) except TypeError: # query has been deleted in the meantime. finish without error, # as deleting it will have been a conscious choice by a user self.job.finish() return if self.dataset.data.get("key_parent", None): # search workers never have parents (for now), so we don't need to # find out what the parent dataset is if it's a search worker try: self.parent = DataSet(key=self.dataset.data["key_parent"], db=self.db) except TypeError: # we need to know what the parent dataset was to properly handle the # analysis self.log.warning( "Processor %s queued for orphan query %s: cannot run, cancelling job" % (self.type, self.dataset.key)) self.job.finish() return if not self.parent.is_finished(): # not finished yet - retry after a while self.job.release(delay=30) return self.parent = DataSet(key=self.dataset.data["key_parent"], db=self.db) self.source_file = self.parent.get_results_path() if not self.source_file.exists(): self.dataset.update_status("Finished, no input data found.") self.log.info("Running post-processor %s on query %s" % (self.type, self.job.data["remote_id"])) self.parameters = self.dataset.parameters self.dataset.update_status("Processing data") self.dataset.update_version(get_software_version()) if self.interrupted: return self.abort() if not self.dataset.is_finished(): try: self.process() self.after_process() except WorkerInterruptedException: self.abort() except Exception as e: frames = traceback.extract_tb(e.__traceback__) frames = [ frame.filename.split("/").pop() + ":" + str(frame.lineno) for frame in frames[1:] ] location = "->".join(frames) # Not all datasets have parent keys if len(self.dataset.get_genealogy()) > 1: parent_key = " (via " + self.dataset.get_genealogy( )[0].key + ")" else: parent_key = "" raise ProcessorException( "Processor %s raised %s while processing dataset %s%s in %s:\n %s\n" % (self.type, e.__class__.__name__, self.dataset.key, parent_key, location, str(e))) else: # dataset already finished, job shouldn't be open anymore self.log.warning( "Job %s/%s was queued for a dataset already marked as finished, deleting..." 
% (self.job.data["jobtype"], self.job.data["remote_id"])) self.job.finish() def after_process(self): """ After processing, declare job finished """ if self.dataset.data["num_rows"] > 0: self.dataset.update_status("Dataset saved.") if not self.dataset.is_finished(): self.dataset.finish() # see if we have anything else lined up to run next for next in self.parameters.get("next", []): next_parameters = next.get("parameters", {}) next_type = next.get("type", "") available_processors = self.dataset.get_available_processors() # run it only if the post-processor is actually available for this query if next_type in available_processors: next_analysis = DataSet( parameters=next_parameters, type=next_type, db=self.db, parent=self.dataset.key, extension=available_processors[next_type]["extension"]) self.queue.add_job(next_type, remote_id=next_analysis.key) # see if we need to register the result somewhere if "copy_to" in self.parameters: # copy the results to an arbitrary place that was passed if self.dataset.get_results_path().exists(): shutil.copyfile(str(self.dataset.get_results_path()), self.parameters["copy_to"]) else: # if copy_to was passed, that means it's important that this # file exists somewhere, so we create it as an empty file with open(self.parameters["copy_to"], "w") as empty_file: empty_file.write("") # see if this query chain is to be attached to another query # if so, the full genealogy of this query (minus the original dataset) # is attached to the given query - this is mostly useful for presets, # where a chain of processors can be marked as 'underlying' a preset if "attach_to" in self.parameters: try: # copy metadata and results to the surrogate surrogate = DataSet(key=self.parameters["attach_to"], db=self.db) if self.dataset.get_results_path().exists(): shutil.copyfile(str(self.dataset.get_results_path()), str(surrogate.get_results_path())) top_parent = self.dataset.get_genealogy()[1] top_parent.link_parent(surrogate.key) try: surrogate.finish(self.dataset.data["num_rows"]) except RuntimeError: # already finished, could happen (though it shouldn't) pass surrogate.update_status(self.dataset.get_status()) except ValueError: # dataset with key to attach to doesn't exist... self.log.warning( "Cannot attach dataset chain containing %s to %s (dataset does not exist)" % (self.dataset.key, self.parameters["attach_to"])) self.job.finish() def abort(self): """ Abort dataset creation and clean up so it may be attempted again later """ # remove any result files that have been created so far if self.dataset.get_results_path().exists(): os.unlink(str(self.dataset.get_results_path())) if self.dataset.get_temporary_path().exists(): shutil.rmtree(str(self.dataset.get_temporary_path())) # we release instead of finish, since interrupting is just that - the # job should resume at a later point. Delay resuming by 10 seconds to # give 4CAT the time to do whatever it wants (though usually this isn't # needed since restarting also stops the spawning of new workers) self.dataset.update_status( "Dataset processing interrupted. 
Retrying later.") if self.interrupted == self.INTERRUPT_RETRY: # retry later - wait at least 10 seconds to give the backend time to shut down self.job.release(delay=10) elif self.interrupted == self.INTERRUPT_CANCEL: # cancel job self.job.finish() def iterate_csv_items(self, path): """ A generator that iterates through a CSV file With every iteration, the processor's 'interrupted' flag is checked, and if set a ProcessorInterruptedException is raised, which by default is caught and subsequently stops execution gracefully. :param Path path: Path to csv file to read :return: """ with open(path, encoding="utf-8") as input: reader = csv.DictReader(input) for item in reader: if self.interrupted: raise ProcessorInterruptedException( "Processor interrupted while iterating through CSV file" ) yield item def write_csv_items_and_finish(self, data): """ Write data as csv to results file and finish dataset Determines result file path using dataset's path determination helper methods. After writing results, the dataset is marked finished. Will raise a ProcessorInterruptedException if the interrupted flag for this processor is set while iterating. :param data: A list or tuple of dictionaries, all with the same keys """ if not (isinstance(data, typing.List) or isinstance(data, typing.Tuple)) or isinstance(data, str): raise TypeError( "write_csv_items requires a list or tuple of dictionaries as argument" ) if not data: raise ValueError( "write_csv_items requires a dictionary with at least one item") if not isinstance(data[0], dict): raise TypeError( "write_csv_items requires a list or tuple of dictionaries as argument" ) self.dataset.update_status("Writing results file") with self.dataset.get_results_path().open("w", encoding="utf-8", newline='') as results: writer = csv.DictWriter(results, fieldnames=data[0].keys()) writer.writeheader() for row in data: if self.interrupted: raise ProcessorInterruptedException( "Interrupted while writing results file") writer.writerow(row) self.dataset.update_status("Finished") self.dataset.finish(len(data)) def is_filter(self): """ Is this processor a filter? Filters do not produce their own dataset but replace the parent dataset instead. :todo: Make this a bit more robust than sniffing the processor category :return bool: """ return hasattr( self, "category" ) and self.category and "filter" in self.category.lower() @abc.abstractmethod def process(self): """ Process data To be defined by the child processor. """ pass
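# --- Illustrative sketch (not part of 4CAT itself) ---------------------------
# The class docstring above explains that a processor consumes the parent
# dataset's CSV and writes a new result set.  A minimal, hypothetical subclass
# built on the iterate_csv_items() and write_csv_items_and_finish() helpers
# defined above might look like this; the type name and output column are
# invented for the example.
class ExampleRowCounter(BasicProcessor):
    """Count the rows of the parent dataset (sketch only)."""
    type = "example-row-counter"
    category = "Other"
    extension = "csv"

    def process(self):
        # iterate_csv_items() checks the interrupted flag on every row
        row_count = 0
        for item in self.iterate_csv_items(self.source_file):
            row_count += 1
        # write_csv_items_and_finish() expects a non-empty list of dicts
        self.write_csv_items_and_finish([{"rows": row_count}])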
from collections import defaultdict
import os
import codecs
import csv
import argparse

from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
import numpy as np
import tensorflow as tf

import model.data

seed = 42
np.random.seed(seed)
tf.set_random_seed(seed)

csv.field_size_limit(2**28)

tokenizer = RegexpTokenizer(r'\w+')
cachedStopWords = stopwords.words("english")
home_dir = os.getenv("HOME")


def loadGloveModel(gloveFile=None, hidden_size=None):
    if gloveFile is None:
        if hidden_size == 50:
            gloveFile = os.path.join(home_dir, "resources/pretrained_embeddings/glove.6B.50d.txt")
        elif hidden_size == 100:
            gloveFile = os.path.join(home_dir, "resources/pretrained_embeddings/glove.6B.100d.txt")
        elif hidden_size == 200:
            gloveFile = os.path.join(home_dir, "resources/pretrained_embeddings/glove.6B.200d.txt")
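loadGloveModel() is truncated above. The standard way to finish it is to read the whitespace-separated GloVe text file into a {word: vector} dict; a self-contained sketch of that pattern (the function name is invented, and the original project's body may differ):

def load_glove_vectors_sketch(glove_path):
    """Standard GloVe text-format loader: each line is a word followed by its vector."""
    vectors = {}
    with codecs.open(glove_path, 'r', 'utf-8') as glove_file:
        for line in glove_file:
            parts = line.rstrip().split(' ')
            vectors[parts[0]] = np.asarray(parts[1:], dtype='float32')
    return vectors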
import sys
import csv

import pika

if __name__ == "__main__":
    csvSet = {
        "100": "./dataset/data_100.csv",
        "8000": "./dataset/data_8000.csv",
        "4000": "./dataset/data_4000.csv",
        "articles1": "./dataset/articles1.csv",
        "articles2": "./dataset/articles2.csv",
        "articles3": "./dataset/articles3.csv"
    }
    csvPath = csvSet[sys.argv[1]]

    connection = pika.BlockingConnection(pika.ConnectionParameters('localhost'))
    channel = connection.channel()

    # queue_name = ["queue_coba", "queue_1", "queue_2", "queue_3"]
    queue_name = sys.argv[2:]

    i = 0
    with open(csvPath) as csvFile:
        csv.field_size_limit(10000000)
        csvReader = csv.DictReader(csvFile)
        for row in csvReader:
            jsonData = {}
            _id = row["id"]
            content = unicode(row["content"], errors="ignore")  # Python 2 text handling
            jsonData["id"] = _id
            jsonData["content"] = content
            produce(queue_name[i], jsonData)  # round-robin over the given queues
            i += 1
            if i > len(queue_name) - 1:
                i = 0
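produce() is referenced but not shown; in the original module it would be defined before the main block. A minimal sketch of what such a helper typically looks like with pika, assuming it reuses the module-level channel and serialises the record as JSON (both assumptions):

import json

def produce(queue, payload):
    """Hypothetical publisher: declare the queue and push one record as JSON."""
    channel.queue_declare(queue=queue)
    channel.basic_publish(exchange='',
                          routing_key=queue,
                          body=json.dumps(payload))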
import csv import ctypes from os import cpu_count from os.path import dirname, join import sys from tangentcft.TangentS.math_tan import latex_mml from tangentcft.TangentS.math_tan.mathml import MathML CSV_PARAMETERS = { 'delimiter': '\t', 'quotechar': '"', 'quoting': csv.QUOTE_MINIMAL, } csv.field_size_limit(int(ctypes.c_ulong(-1).value // 2)) sys.setrecursionlimit(15000) ARQMATH_INPUT_DATA_DIRNAME = '/mnt/storage/ARQMath_CLEF2020' ARQMATH_OUTPUT_DATA_DIRNAME = 'output_data/ARQMath_CLEF2020' ARQMATH_COLLECTION_INPUT_DATA_DIRNAME = '{}/Collection'.format( ARQMATH_INPUT_DATA_DIRNAME) ARQMATH_COLLECTION_OUTPUT_DATA_DIRNAME = '{}/Collection'.format( ARQMATH_OUTPUT_DATA_DIRNAME) ARQMATH_COLLECTION_QRELS_FILENAME = '{}/votes-qrels.V1.2.tsv'.format( ARQMATH_COLLECTION_OUTPUT_DATA_DIRNAME) ARQMATH_COLLECTION_POSTS_LATEX_FILENAME = '{}/Posts.V1.2_latex.json.gz'.format(
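The expression int(ctypes.c_ulong(-1).value // 2) evaluates to the platform's LONG_MAX, which is the largest value csv.field_size_limit() accepts (the limit is stored in a C long, so sys.maxsize overflows on Windows where long is 32 bits). An equivalent, commonly seen fallback pattern is sketched below; it is not part of this project:

def set_max_csv_field_size_limit():
    """Raise the csv field size limit as high as the platform's C long allows."""
    limit = sys.maxsize
    while True:
        try:
            csv.field_size_limit(limit)
            return limit
        except OverflowError:
            # C long is narrower than sys.maxsize (e.g. on Windows); halve and retry
            limit //= 2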
def __init__(self, streamID, write=False, fields=None, missingValues=None, bookmark=None, includeMS=True, firstRecord=None): """ Constructor streamID: CSV file name, input or output write: True or False, open for writing if True fields: a list of nupic.data.fieldmeta.FieldMetaInfo field descriptors, only applicable when write==True missingValues: what missing values should be replaced with? bookmark: a reference to the previous reader, if passed in, the records will be returned starting from the point where bookmark was requested. Either bookmark or firstRecord can be specified, not both. If bookmark is used, then firstRecord MUST be None. includeMS: If false, the microseconds portion is not included in the generated output file timestamp fields. This makes it compatible with reading in from Excel. firstRecord: 0-based index of the first record to start reading from. Either bookmark or firstRecord can be specified, not both. If bookmark is used, then firstRecord MUST be None. Each field is a 3-tuple (name, type, special or '') The name is the name of the field. The type is one of: 'string', 'datetime', 'int', 'float', 'bool' The special is either empty or one of S, R, T, C that designate their field as the sequenceId, reset, timestamp, or category. There can be at most one of each. There may be multiple fields of type datetime, but no more than one of them may be the timestamp field (T). The sequence id field must be either a string or an int. The reset field must be an int (and must contain 0 or 1). The category field must be an int. The FileRecordStream iterates over the field names, types and specials and stores the information. """ # Call superclass constructor super(FileRecordStream, self).__init__() # Only bookmark or firstRow can be specified, not both if bookmark is not None and firstRecord is not None: raise RuntimeError("Only bookmark or firstRecord can be specified, not both") if fields is None: fields = [] if missingValues is None: missingValues = [''] # We'll be operating on csvs with arbitrarily long fields size = 2**27 csv.field_size_limit(size) self._filename = streamID # We can't guarantee what system files are coming from, use universal # newlines self._write = write self._mode = self._FILE_WRITE_MODE if write else self._FILE_READ_MODE self._file = open(self._filename, self._mode) self._sequences = set() self.rewindAtEOF = False if write: assert fields is not None assert isinstance(fields, (tuple, list)) # Verify all fields are 3-tuple assert all(isinstance(f, (tuple, FieldMetaInfo)) and len(f) == 3 for f in fields) names, types, specials = zip(*fields) self._writer = csv.writer(self._file) else: os.linesep = '\n' # make sure readline() works on windows too. 
# Read header lines self._reader = csv.reader(self._file, dialect='excel', quoting=csv.QUOTE_NONE) try: names = [n.strip() for n in self._reader.next()] except: raise Exception('The header line of the file %s contained a NULL byte' \ % self._filename) types = [t.strip() for t in self._reader.next()] specials = [s.strip() for s in self._reader.next()] # If there are no specials, this means there was a blank line if len(specials) == 0: specials=[""] if not(len(names) == len(types) == len(specials)): raise Exception('Invalid file format: different number of fields ' 'in the header rows of file %s (%d, %d, %d)' % (streamID, len(names), len(types), len(specials))) # Verify standard file format allowedTypes = ('string', 'datetime', 'int', 'float', 'bool') for i, t in enumerate(types): # This is a temporary hack for the Precog milestone, which passes in a # type 'address' for address fields. Here we simply map the type "address" # to "string". if t == 'address': types[i] = 'string' t = 'string' if t not in allowedTypes: raise Exception('Invalid file format for "%s" - field type "%s" ' 'not one of %s ' % (self._filename, t, allowedTypes)) for s in specials: if s not in ('', 'T', 'R', 'S', 'C', 'L'): raise Exception('Invalid file format. \'%s\' is not a valid special ' 'flag' % s) self._fields = [FieldMetaInfo(*attrs) for attrs in zip(names, types, specials)] self._fieldCount = len(self._fields) # Keep track on how many records have been read/written self._recordCount = 0 self._timeStampIdx = specials.index('T') if 'T' in specials else None self._resetIdx = specials.index('R') if 'R' in specials else None self._sequenceIdIdx = specials.index('S') if 'S' in specials else None self._categoryIdx = specials.index('C') if 'C' in specials else None self._learningIdx = specials.index('L') if 'L' in specials else None # keep track of the current sequence self._currSequence = None self._currTime = None if self._timeStampIdx: assert types[self._timeStampIdx] == 'datetime' if self._sequenceIdIdx: assert types[self._sequenceIdIdx] in ('string', 'int') if self._resetIdx: assert types[self._resetIdx] == 'int' if self._categoryIdx: assert types[self._categoryIdx] == 'int' if self._learningIdx: assert types[self._learningIdx] == 'int' # Convert the types to the actual types in order to convert the strings if self._mode == self._FILE_READ_MODE: m = dict(int=intOrNone, float=floatOrNone, bool=parseBool, string=unescape, datetime=parseTimestamp) else: if includeMS: datetimeFunc = serializeTimestamp else: datetimeFunc = serializeTimestampNoMS m = dict(int=str, float=str, string=escape, bool=str, datetime=datetimeFunc) self._adapters = [m[t] for t in types] self._missingValues = missingValues # # If the bookmark is set, we need to skip over first N records # if bookmark is not None: rowsToSkip = self._getStartRow(bookmark) elif firstRecord is not None: rowsToSkip = firstRecord else: rowsToSkip = 0 while rowsToSkip > 0: self.next() rowsToSkip -= 1 # Dictionary to store record statistics (min and max of scalars for now) self._stats = None
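The three header rows this reader expects look like the following in practice; the field names are made up for illustration:

timestamp,consumption,gym
datetime,float,string
T,,S

Row one gives the field names, row two the types, and row three the special flags (T = timestamp, R = reset, S = sequence id, C = category, L = learning), matching the constructor docstring above.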
import os import io import csv import ast import sys import math import struct from enum import Enum from exceptions import CSVError, SchemaError csv.field_size_limit(sys.maxsize) # Don't limit the size of user input fields. class Type(Enum): UNKNOWN = 0 BOOL = 1 DOUBLE = 2 FLOAT = 2 # alias to DOUBLE STRING = 3 LONG = 4 INT = 4 # alias to LONG INTEGER = 4 # alias to LONG ARRAY = 5 ID = 6 START_ID = 7 END_ID = 8 IGNORE = 9 def convert_schema_type(in_type): try:
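convert_schema_type() is cut off above. A plausible implementation simply looks the header token up in the Type enum and raises SchemaError for unknown names; this is a sketch under that assumption, not the project's actual code:

def convert_schema_type_sketch(in_type):
    """Map a schema token such as 'STRING' or 'long' onto the Type enum."""
    try:
        return Type[in_type.upper()]
    except KeyError:
        raise SchemaError("Unknown schema type '%s'" % in_type)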
#!/usr/bin/env python import sys import csv csv.field_size_limit(sys.maxsize) # make sure we can write very large csv fields import os import argparse import colored_traceback.always # if you move this script, you'll need to change this method of getting the imports partis_dir = os.path.dirname(os.path.realpath(__file__)).replace('/bin', '') sys.path.insert(1, partis_dir + '/python') import utils import glutils from clusterpath import ClusterPath helpstr = """ Script to extract sequences from a partis output file and write them to a fasta file. For details of partis output files, see the manual. Example usage: ./bin/extract-fasta.py --input-file partis-output.yaml --fasta-output-file out.fa # extact all sequences from best partition in <partis-output.yaml> """ class MultiplyInheritedFormatter(argparse.RawTextHelpFormatter, argparse.ArgumentDefaultsHelpFormatter): pass formatter_class = MultiplyInheritedFormatter parser = argparse.ArgumentParser(formatter_class=MultiplyInheritedFormatter, description=helpstr) parser.add_argument('--input-file', required=True, help='partis output file') parser.add_argument('--fasta-output-file', required=True, help='output fasta file name') parser.add_argument('--partition-index', type=int, help='if set, use the partition at this index in the cluster path, rather than the default of using the best partition') parser.add_argument('--seed-unique-id', help='if set, take sequences only from the cluster containing this seed sequence, rather than the default of taking all sequences from all clusters') parser.add_argument('--cluster-index', type=int, help='if set, take sequences only from the cluster at this index in the partition, rather than the default of taking all sequences from all clusters')
class DSVParser(interface.FileObjectParser): """Delimiter separated values (DSV) parser interface.""" # A list that contains the names of all the fields in the log file. This # needs to be defined by each DSV parser. COLUMNS = [] # The default delimiter is a comma, but a tab, pipe or other character are # known to be used. Note the delimiter must be a byte string otherwise csv # module can raise a TypeError indicating that "delimiter" must be a single # character string. DELIMITER = b',' # If there is a header before the lines start it can be defined here, and # the number of header lines that need to be skipped before the parsing # starts. NUMBER_OF_HEADER_LINES = 0 # If there is a special quote character used inside the structured text # it can be defined here. QUOTE_CHAR = b'"' # The maximum size of a single field in the parser FIELD_SIZE_LIMIT = csv.field_size_limit() # Value that should not appear inside the file, made to test the actual # file to see if it confirms to standards. _MAGIC_TEST_STRING = b'RegnThvotturMeistarans' def __init__(self, encoding=None): """Initializes a delimiter separated values (DSV) parser. Args: encoding (Optional[str]): encoding used in the DSV file, where None indicates the codepage of the parser mediator should be used. """ super(DSVParser, self).__init__() self._encoding = encoding if py2to3.PY_2: self._end_of_line = b'\n' else: self._end_of_line = '\n' self._maximum_line_length = ( len(self._end_of_line) + len(self.COLUMNS) * (self.FIELD_SIZE_LIMIT + len(self.DELIMITER))) def _ConvertRowToUnicode(self, parser_mediator, row): """Converts all strings in a DSV row dict to Unicode. Args: parser_mediator (ParserMediator): mediates interactions between parsers and other components, such as storage and dfvfs. row (dict[str, bytes]): a row from a DSV file, where the dictionary key contains the column name and the value a binary string. Returns: dict[str, str]: a row from the DSV file, where the dictionary key contains the column name and the value a Unicode string. """ for key, value in iter(row.items()): if isinstance(value, py2to3.UNICODE_TYPE): continue try: row[key] = value.decode(self._encoding) except UnicodeDecodeError: replaced_value = value.decode(self._encoding, errors='replace') parser_mediator.ProduceExtractionWarning( 'error decoding DSV value: {0:s} as {1:s}, characters have been ' 'replaced in {2:s}'.format(key, self._encoding, replaced_value)) row[key] = replaced_value return row def _CreateDictReader(self, line_reader): """Returns a reader that processes each row and yields dictionaries. csv.DictReader does this job well for single-character delimiters; parsers that need multi-character delimiters need to override this method. Args: line_reader (iter): yields lines from a file-like object. Returns: iter: a reader of dictionaries, as returned by csv.DictReader(). """ delimiter = self.DELIMITER quotechar = self.QUOTE_CHAR magic_test_string = self._MAGIC_TEST_STRING # Python 3 csv module requires arguments to constructor to be of type str. if py2to3.PY_3: delimiter = delimiter.decode(self._encoding) quotechar = quotechar.decode(self._encoding) magic_test_string = magic_test_string.decode(self._encoding) return csv.DictReader( line_reader, delimiter=delimiter, fieldnames=self.COLUMNS, quotechar=quotechar, restkey=magic_test_string, restval=magic_test_string) # pylint: disable=missing-return-type-doc def _CreateLineReader(self, file_object): """Creates an object that reads lines from a text file. 
The line reader is advanced to the beginning of the DSV content, skipping any header lines. Args: file_object (dfvfs.FileIO): file-like object. Returns: TextFile|BinaryLineReader: an object that implements an iterator over lines in a text file. Raises: UnicodeDecodeError: if the file cannot be read with the specified encoding. """ # The Python 2 csv module reads bytes and the Python 3 csv module Unicode # reads strings. if py2to3.PY_3: line_reader = text_file.TextFile( file_object, encoding=self._encoding, end_of_line=self._end_of_line) # pylint: disable=protected-access maximum_read_buffer_size = line_reader._MAXIMUM_READ_BUFFER_SIZE else: line_reader = line_reader_file.BinaryLineReader( file_object, end_of_line=self._end_of_line) maximum_read_buffer_size = line_reader.MAXIMUM_READ_BUFFER_SIZE # Line length is one less than the maximum read buffer size so that we # tell if there's a line that doesn't end at the end before the end of # the file. if self._maximum_line_length > maximum_read_buffer_size: self._maximum_line_length = maximum_read_buffer_size - 1 # If we specifically define a number of lines we should skip, do that here. for _ in range(0, self.NUMBER_OF_HEADER_LINES): line_reader.readline(self._maximum_line_length) return line_reader def _HasExpectedLineLength(self, file_object): """Determines if a file begins with lines of the expected length. As we know the maximum length of valid lines in the DSV file, the presence of lines longer than this indicates that the file will not be parsed successfully, without reading excessive data from a large file. Args: file_object (dfvfs.FileIO): file-like object. Returns: bool: True if the file has lines of the expected length. """ original_file_position = file_object.tell() line_reader = self._CreateLineReader(file_object) for _ in range(0, 20): # Attempt to read a line that is longer than any line that should be in # the file. sample_line = line_reader.readline(self._maximum_line_length + 1) if len(sample_line) > self._maximum_line_length: file_object.seek(original_file_position) return False file_object.seek(original_file_position) return True @classmethod def GetFormatSpecification(cls): """Retrieves the format specification. Returns: FormatSpecification: format specification. """ return specification.FormatSpecification(cls.NAME, text_format=True) def ParseFileObject(self, parser_mediator, file_object): """Parses a DSV text file-like object. Args: parser_mediator (ParserMediator): mediates interactions between parsers and other components, such as storage and dfvfs. file_object (dfvfs.FileIO): file-like object. Raises: UnableToParseFile: when the file cannot be parsed. """ # TODO: Replace this with detection of the file encoding via byte-order # marks. 
Also see: https://github.com/log2timeline/plaso/issues/1971 if not self._encoding: self._encoding = parser_mediator.codepage try: if not self._HasExpectedLineLength(file_object): display_name = parser_mediator.GetDisplayName() raise errors.UnableToParseFile(( '[{0:s}] Unable to parse DSV file: {1:s} with error: ' 'unexpected line length.').format(self.NAME, display_name)) except UnicodeDecodeError as exception: display_name = parser_mediator.GetDisplayName() raise errors.UnableToParseFile( '[{0:s}] Unable to parse DSV file: {1:s} with error: {2!s}.'.format( self.NAME, display_name, exception)) try: line_reader = self._CreateLineReader(file_object) reader = self._CreateDictReader(line_reader) row_offset = line_reader.tell() row = next(reader) except (StopIteration, csv.Error, UnicodeDecodeError) as exception: display_name = parser_mediator.GetDisplayName() raise errors.UnableToParseFile( '[{0:s}] Unable to parse DSV file: {1:s} with error: {2!s}.'.format( self.NAME, display_name, exception)) number_of_columns = len(self.COLUMNS) number_of_records = len(row) if number_of_records != number_of_columns: display_name = parser_mediator.GetDisplayName() raise errors.UnableToParseFile(( '[{0:s}] Unable to parse DSV file: {1:s}. Wrong number of ' 'records (expected: {2:d}, got: {3:d})').format( self.NAME, display_name, number_of_columns, number_of_records)) for key, value in row.items(): if self._MAGIC_TEST_STRING in (key, value): display_name = parser_mediator.GetDisplayName() raise errors.UnableToParseFile(( '[{0:s}] Unable to parse DSV file: {1:s}. Signature ' 'mismatch.').format(self.NAME, display_name)) row = self._ConvertRowToUnicode(parser_mediator, row) if not self.VerifyRow(parser_mediator, row): display_name = parser_mediator.GetDisplayName() raise errors.UnableToParseFile(( '[{0:s}] Unable to parse DSV file: {1:s}. Verification ' 'failed.').format(self.NAME, display_name)) self.ParseRow(parser_mediator, row_offset, row) row_offset = line_reader.tell() for row in reader: if parser_mediator.abort: break row = self._ConvertRowToUnicode(parser_mediator, row) self.ParseRow(parser_mediator, row_offset, row) row_offset = line_reader.tell() @abc.abstractmethod def ParseRow(self, parser_mediator, row_offset, row): """Parses a line of the log file and produces events. Args: parser_mediator (ParserMediator): mediates interactions between parsers and other components, such as storage and dfvfs. row_offset (int): offset of the row. row (dict[str, str]): fields of a single row, as specified in COLUMNS. """ # pylint: disable=redundant-returns-doc @abc.abstractmethod def VerifyRow(self, parser_mediator, row): """Verifies if a line of the file is in the expected format.
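A concrete parser only has to declare its columns and implement the two abstract methods above. A minimal hypothetical subclass, where the NAME, delimiter, columns and verification rule are invented for illustration and the event-production machinery is omitted:

class ExampleCommentsParser(DSVParser):
    """Hypothetical parser for a two-column 'posted;comment' log."""

    NAME = 'example_comments'
    DELIMITER = b';'
    COLUMNS = ['posted', 'comment']

    def ParseRow(self, parser_mediator, row_offset, row):
        # A real plaso parser would fill an event data object and hand it to
        # parser_mediator; that part is left out of this sketch.
        pass

    def VerifyRow(self, parser_mediator, row):
        # Only claim the file if the first column looks like a POSIX timestamp.
        try:
            int(row['posted'], 10)
        except (ValueError, TypeError):
            return False
        return True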
from scipy import sparse
import os
import sys
import csv

import spacy
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.nn import init
from torchtext import data, datasets
from torchtext.vocab import Vectors
import torchtext.vocab as vocab

csv.field_size_limit(20000001)

base_dir = "/content/drive/My Drive/Colab Notebooks/"

LABEL = data.LabelField()
# LENGTH = data.Field(use_vocab=False, dtype=torch.long)
creative_id_TEXT = data.Field(sequential=True, lower=True, include_lengths=True, fix_length=100)
advertiser_id_TEXT = data.Field(sequential=True, lower=True, include_lengths=True, fix_length=100)
ad_id_TEXT = data.Field(sequential=True, lower=True, include_lengths=True, fix_length=100)
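These Field objects are normally bound to a CSV through torchtext's legacy TabularDataset before vocabularies are built. A sketch of that wiring, where the file name and column order are assumptions:

train = data.TabularDataset(
    path=os.path.join(base_dir, "train.csv"),  # hypothetical file name
    format="csv",
    skip_header=True,
    fields=[("label", LABEL),
            ("creative_id", creative_id_TEXT),
            ("ad_id", ad_id_TEXT),
            ("advertiser_id", advertiser_id_TEXT)])

for field in (creative_id_TEXT, ad_id_TEXT, advertiser_id_TEXT, LABEL):
    field.build_vocab(train)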
import collections
import sys
import csv
from collections import defaultdict
from collections import Counter

csv.field_size_limit(sys.maxsize)  # raise the default per-field limit of 131072 characters; our data exceeds it (over 233000)

###### take transitions data
columns = defaultdict(list)  # each value in each column is appended to a list

with open('user_transitions.csv', "r") as file:  # open data file
    reader = csv.reader(file, delimiter='|', quotechar='"')  # each row comes back as a list of values
    next(reader)  # skip header
    for row in reader:
        # print(row)
        for i, v in enumerate(row):  # go over each column index and value
            columns[i].append(v)  # append the value to the list for that column index

tran_id = columns[0]
# print(tran_id[0])
tran_uuid = columns[1]
tran_from_url = columns[2]
tran_to_url = columns[3]
tran_cookie_id = columns[4]
tran_from_material_model_id = columns[5]
tran_to_material_model_id = columns[6]
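When the header row is usable, csv.DictReader removes the need to track column positions by hand. A sketch of that variant; the column name used at the end is a guess based on the variable names above:

with open('user_transitions.csv', "r") as transitions_file:
    reader = csv.DictReader(transitions_file, delimiter='|', quotechar='"')
    rows = list(reader)  # each row is a dict keyed by header name
# e.g. from_urls = [row['from_url'] for row in rows]  # hypothetical column name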
import sys, csv, json, wppbatchlib csv.field_size_limit(min(2147483647, sys.maxsize)) VERSION = '0.1' AUTHOR = 'Trevor Anderson <*****@*****.**>' iFilePath = None resultsFilePath = None if sys.argv == None or len(sys.argv) != 2 or len( sys.argv[1]) < 5 or sys.argv[1][-14:] != 'rawresults.csv': print 'Drop a CSV file containing raw JSON results onto this program to use it.' print '(you need to run SearchPhone.bat first)' var = raw_input("Hit enter to quit") quit() iFilePath = sys.argv[1] resultsFilePath = sys.argv[1][:-15] + '_results.csv' print 'Extracting Phone Intelligence results from ' + str(iFilePath) csvReader = csv.reader(open(iFilePath, 'rbU'), delimiter=',', quotechar='"') csvWriter = csv.writer(open(resultsFilePath, 'wb'), delimiter=',', quotechar='"') rowNum = 0 for row in csvReader: #each raw results row will contain the original input file row, followed by the API URL, #followed by the JSON response. rowNum += 1
""" A Fast, Offline Reverse Geocoder in Python A Python library for offline reverse geocoding. It improves on an existing library called reverse_geocode developed by Richard Penman. """ from __future__ import print_function __author__ = 'Ajay Thampi' import os import sys import csv if sys.platform == 'win32': # Windows C long is 32 bits, and the Python int is too large to fit inside. # Use the limit appropriate for a 32-bit integer as the max file size csv.field_size_limit(2**31 - 1) else: csv.field_size_limit(sys.maxsize) import zipfile from scipy.spatial import cKDTree as KDTree from reverse_geocoder import cKDTree_MP as KDTree_MP import numpy as np GN_URL = 'http://download.geonames.org/export/dump/' GN_CITIES1000 = 'cities1000' GN_ADMIN1 = 'admin1CodesASCII.txt' GN_ADMIN2 = 'admin2Codes.txt' # Schema of the GeoNames Cities with Population > 1000 GN_COLUMNS = { 'geoNameId': 0, 'name': 1,