# encoding=utf-8
import jieba
import csv
import re

f = open('D:\\Shared\\Rawdata\\Split\\test\\splitBD.txt', 'a', encoding='UTF-8')
splitcsv = []
jieba.set_dictionary('dict.txt.big')
jieba.load_userdict('dictnew.txt')

with open('D:\\Shared\\Rawdata\\Raw\\BD.csv', encoding='UTF-8') as rawcsv:
    readCSV = csv.DictReader(rawcsv, delimiter=',')
    for row in readCSV:
        splitpost = []
        temp = row['article']
        rr = "[\s+\.\!\/_,$%^*(+\"\'\-]+|[+——!,。?、~@#¥%……&*()「」《》?:·).〈〉:/;◆■◇×=|°│─;“”\[\]→↓Nㄧˋ%\}\{\>\<’`÷‘±↑╱『˙<≠┤‘§€↑╱★ˇ←≧┐└‧+ˊ』>-~\ –ㄟ*※【】,、。.}{()╴—–|·‥…!?:;‧〔〕【】《》〈〉「」『』‘’“”☆◎▲△●○〃§※*&#′‵〞〝★◇◆□■▽▼㊣ˍ﹉﹊﹍﹎﹋﹌♀∕\/∣∥↘↙↗↖→←↓↑⊙⊕♂℅ ̄_+-×÷±√<>=≦≧≠∞$∴∵∮∫㏑㏒⊿∟∠⊥∪∩~≡≒¥〒¢£%€℃℉㏕㎝㎜㎞㏎㎡㎎㎏㏄°▁▂▃▄▅▆▆▇█▏▎▍▌▋▊▉┼┴┴┬┤├▔─│┌▕┐└┘╩╚╣╬╠╗╦╔╡╪╞═╯╰╮╭╝╒╤╕╘╧╛╓╥╖╟╫╢╙╨╜║▓◢◣◥◤╱╲╳˙ˉˊˇˋㄅㄆㄇㄈㄉㄊㄋㄌㄍㄎㄏㄐㄠㄟㄞㄝㄜㄛㄚㄙㄘㄗㄖㄕㄔㄓㄒㄑㄡㄢㄣㄤㄥㄦㄧㄨㄩ1&423567890qazxswedcvfrtgbnhyujmkiolp︱QAZXSWEDCVFRTGBNHYUJMKILOPⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩ@︰﹪﹣]+"
        article = re.sub(rr, "", temp)
        words = jieba.cut(article, cut_all=False, HMM=True)
        for word in words:
            f.write(word)
            f.write("\n")

f.close()
rawcsv.close()
acom_list = []
for ad in acom_diagrams:
    acom_list.append(str(ad))
    all_architectures_list.append(ad)

all_architectures_list = sorted(all_architectures_list, key=lambda x: x.name)
parsed_articles = []
all_architectures_list = unique(all_architectures_list, cleanup=False)
root = path.dirname(path.abspath(__file__))

popularity = []
with open(path.join(root, "popularity.csv"), newline='') as csvfile:
    reader = csv.DictReader(csvfile)
    for line in reader:
        popularity.append(dict(line))

for file in all_architectures_list:
    article = {}
    pricing = False
    deploy = False
    sample_code = False
    file_path = Path(file)
    str_path = str(file_path)
    # Skip index pages
    # print(str_path)
    if is_excluded(file_path):
plt.title("Speed distribution for vessel " + str(mmsi)) plt.xlabel("Speed, Knots") plt.ylabel("Frequency") fig = plt.gcf() fig_size = plt.rcParams["figure.figsize"] # Prints: [8.0, 6.0] # Set figure width to 12 and height to 9 fig_size[0] = 9 fig_size[1] = 2.5 plt.rcParams["figure.figsize"] = fig_size plt.gcf().subplots_adjust(bottom=0.20) fig.set_size_inches(fig_size[0], fig_size[1]) fig.savefig(outdir + str(mmsi) + '.png', dpi=100, bbox_inches='tight') print "succeed, ", mmsi plt.clf() else: print "not enough data for ", mmsi # plt.show() with open(sourcedir + filename, 'rU') as f: reader = csv.DictReader(f, delimiter=',') for row in reader: mmsi = row['mmsi'] # months_2014 = row['months_2014'].split(",") months_2015 = row['months_2015'].split(",") make_histogram(mmsi, months_2015)
def read_schedules(self):
    """
    Go through each student schedule and compile each course and its conflicts, as well as
    the total course list and the student object list
    :return: tuple containing: list of students, list of all courses,
             dictionary of courses and conflicts (and empty colors)
    """
    with open(self.schedules_csv, newline='') as fp:
        reader = csv.DictReader(fp)
        students = []
        courses = []
        conflicts = {}
        for row in reader:
            # get the course numbers for the current student
            student_course_nums = []
            for i in range(1, 7):
                course_num = row['Course {}'.format(str(i))]
                if course_num:  # skip blank/missing cells (DictReader yields '' for empty fields)
                    student_course_nums.append(course_num)
            # loop through each course that we found
            for student_course_num in student_course_nums:
                # disregard blank courses (not a full schedule)
                if student_course_num:
                    # add it to the full course list
                    if student_course_num not in courses:
                        courses.append(student_course_num)
                    if student_course_num not in conflicts:
                        # Conflicts should be all of the courses that are in the list besides
                        # the current course, as it is the first entry
                        conflicts[student_course_num] = [
                            x for i, x in enumerate(student_course_nums)
                            if i != student_course_nums.index(student_course_num)
                        ]
                    else:
                        course_conflicts = conflicts[student_course_num]
                        # Add conflicts to the existing course
                        # Should be all courses besides the current one and that are not already in the list
                        new_conflicts = [
                            x for i, x in enumerate(student_course_nums)
                            if i != student_course_nums.index(student_course_num)
                            and x not in course_conflicts
                        ]
                        # add the new conflicts we found to the list
                        if len(new_conflicts) > 0:
                            for i in range(len(new_conflicts)):
                                course_conflicts.append(new_conflicts[i])
                        # put it back in the dictionary
                        conflicts[student_course_num] = course_conflicts
            # add a student object to the list
            students.append(Student(row['Lastname'], row['Firstname'], student_course_nums))
        # set all the course colors (second item in the tuple) to None for now, handled later
        for course in conflicts:
            conflicts[course] = (conflicts[course], None)
        return students, courses, conflicts
def start_requests(self):
    with open('newsUrlCrawl.csv') as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            yield scrapy.Request(row['url'], self.parse_news)
import csv
from numpy import *
from pymc import *

household_data = [d for d in csv.DictReader(open('household.csv'))]
county_data = [d for d in csv.DictReader(open('county.csv'))]

# hyper-priors
g = Uniform('gamma', [-100, -100], [100, 100])
s_a = Uniform('sigma_a', 0, 100)

# priors
a = {}
for d in county_data:
    @stochastic(name='a_%s' % d['county'])
    def a_j(value=0., g=g, u_j=float(d['u']), s_a=s_a):
        return normal_like(value, g[0] + g[1] * u_j, s_a**-2.)
    a[d['county']] = a_j

b = Uniform('beta', -100, 100)
s_y = Uniform('sigma_y', 0, 100)

# likelihood
y = {}
for d in household_data:
    @stochastic(observed=True, name='y_%s' % d['household'])
def generate_images_annotations_json(main_dir, metadata_dir, subset, cls_index, version='v4'):
    validation_image_ids = {}

    if version == 'v4':
        annotations_path = os.path.join(
            metadata_dir, subset, '{}-annotations-bbox.CSV'.format(subset))
    elif version == 'challenge2018':
        validation_image_ids_path = os.path.join(
            metadata_dir, 'challenge-2018-image-ids-valset-od.CSV')

        with open(validation_image_ids_path, 'r') as csv_file:
            reader = csv.DictReader(csv_file, fieldnames=['ImageID'])
            next(reader)
            for line, row in enumerate(reader):
                image_id = row['ImageID']
                validation_image_ids[image_id] = True

        annotations_path = os.path.join(
            metadata_dir, 'challenge-2018-train-annotations-bbox.CSV')
    else:
        annotations_path = os.path.join(metadata_dir, subset, 'annotations-human-bbox.CSV')

    fieldnames = [
        'ImageID', 'Source', 'LabelName', 'Confidence', 'XMin', 'XMax', 'YMin',
        'YMax', 'IsOccluded', 'IsTruncated', 'IsGroupOf', 'IsDepiction', 'IsInside'
    ]

    id_annotations = dict()
    with open(annotations_path, 'r') as csv_file:
        reader = csv.DictReader(csv_file, fieldnames=fieldnames)
        next(reader)

        images_sizes = {}
        for line, row in enumerate(reader):
            frame = row['ImageID']

            if version == 'challenge2018':
                if subset == 'train':
                    if frame in validation_image_ids:
                        continue
                elif subset == 'validation':
                    if frame not in validation_image_ids:
                        continue
                else:
                    raise NotImplementedError(
                        'This generator handles only the train and validation subsets')

            class_name = row['LabelName']
            if class_name not in cls_index:
                continue
            cls_id = cls_index[class_name]

            if version == 'challenge2018':
                # We recommend participants to use the provided subset of the training set
                # as a validation set. This is preferable over using the V4 val/test sets,
                # as the training set is more densely annotated.
                img_path = os.path.join(main_dir, 'images', 'train', frame + '.jpg')
            else:
                img_path = os.path.join(main_dir, 'images', subset, frame + '.jpg')

            if frame in images_sizes:
                width, height = images_sizes[frame]
            else:
                try:
                    with Image.open(img_path) as img:
                        width, height = img.width, img.height
                        images_sizes[frame] = (width, height)
                except Exception as ex:
                    if version == 'challenge2018':
                        raise ex
                    continue

            x1 = float(row['XMin'])
            x2 = float(row['XMax'])
            y1 = float(row['YMin'])
            y2 = float(row['YMax'])

            x1_int = int(round(x1 * width))
            x2_int = int(round(x2 * width))
            y1_int = int(round(y1 * height))
            y2_int = int(round(y2 * height))

            # Check that the bounding box is valid.
            if x2 <= x1:
                raise ValueError(
                    'line {}: x2 ({}) must be higher than x1 ({})'.format(line, x2, x1))
            if y2 <= y1:
                raise ValueError(
                    'line {}: y2 ({}) must be higher than y1 ({})'.format(line, y2, y1))

            if y2_int == y1_int:
                warnings.warn(
                    'filtering line {}: rounding y2 ({}) and y1 ({}) makes them equal'.format(line, y2, y1))
                continue
            if x2_int == x1_int:
                warnings.warn(
                    'filtering line {}: rounding x2 ({}) and x1 ({}) makes them equal'.format(line, x2, x1))
                continue

            img_id = row['ImageID']
            annotation = {'cls_id': cls_id, 'x1': x1, 'x2': x2, 'y1': y1, 'y2': y2}

            if img_id in id_annotations:
                annotations = id_annotations[img_id]
                annotations['boxes'].append(annotation)
            else:
                id_annotations[img_id] = {'w': width, 'h': height, 'boxes': [annotation]}
    return id_annotations
def df_input_schema(_context, path):
    with open(path, 'r') as fd:
        return [
            OrderedDict(sorted(x.items(), key=lambda x: x[0]))
            for x in csv.DictReader(fd)
        ]
def cvs_upload(request):
    if request.method == "POST":
        form = CVSUploadForm(request.POST, request.FILES)
        if form.is_valid():
            form.save()
            CSV_PATH = settings.MEDIA_ROOT + str(CVSUpload.objects.last().file)
            with open(CSV_PATH, encoding='utf-8-sig') as csvfile:
                data_reader = csv.DictReader(csvfile)
                for row in data_reader:
                    print(row['calling_plan'], '있음')
                    try:
                        plan = CallingPlan.objects.get(
                            calling_plan=row['calling_plan'],
                            mobile_carrier=row['mobile_carrier'])
                        if plan:
                            plan.company = row['company']
                            plan.brand = row['brand']
                            plan.homepage = row['homepage']
                            plan.calling_plan = row['calling_plan']
                            plan.mobile_carrier = row['mobile_carrier']
                            plan.category = row['category']
                            plan.data_speed = row['data_speed']
                            plan.data_category = row['data_category']
                            plan.data_unit = row['data_unit']
                            plan.call = row['call']
                            plan.call_unit = row['call_unit']
                            plan.unlimited_free = row['unlimited_free']
                            plan.message = row['message']
                            plan.message_unit = row['message_unit']
                            plan.data1 = row['data1']
                            plan.data2 = row['data2']
                            plan.pay = row['pay']
                            plan.promo_pay = row['promo_pay']
                            # calling_plan.saled_pay1 = row['saled_pay1']
                            # calling_plan.saled_pay2 = row['saled_pay2']
                            # calling_plan.saled_pay3 = row['saled_pay3']
                            # calling_plan.sales_pay1 = row['sales_pay1']
                            # calling_plan.condition1 = row['condition1']
                            # calling_plan.sales_pay2 = row['sales_pay2']
                            # calling_plan.condition2 = row['condition2']
                            # calling_plan.sales_pay3 = row['sales_pay3']
                            # calling_plan.condition3 = row['condition3']
                            # calling_plan.etc1 = row['etc1']
                            # calling_plan.etc2 = row['etc2']
                            # calling_plan.etc3 = row['etc3']
                            plan.activation = row['activation']
                            plan.update_date = timezone.localtime()
                            plan.save()
                    except ObjectDoesNotExist:
                        print(row['calling_plan'], '추가')
                        data = CallingPlan.objects.create(
                            company=row['company'],
                            brand=row['brand'],
                            homepage=row['homepage'],
                            calling_plan=row['calling_plan'],
                            mobile_carrier=row['mobile_carrier'],
                            category=row['category'],
                            data_speed=row['data_speed'],
                            data_category=row['data_category'],
                            data_unit=row['data_unit'],
                            call=row['call'],
                            call_unit=row['call_unit'],
                            unlimited_free=row['unlimited_free'],
                            message=row['message'],
                            message_unit=row['message_unit'],
                            data1=row['data1'],
                            data2=row['data2'],
                            pay=row['pay'],
                            promo_pay=row['promo_pay'],
                            # saled_pay1=row['saled_pay1'],
                            # saled_pay2=row['saled_pay2'],
                            # saled_pay3=row['saled_pay3'],
                            # sales_pay1=row['sales_pay1'],
                            # condition1=row['condition1'],
                            # sales_pay2=row['sales_pay2'],
                            # condition2=row['condition2'],
                            # sales_pay3=row['sales_pay3'],
                            # condition3=row['condition3'],
                            # etc1=row['etc1'],
                            # etc2=row['etc2'],
                            # etc3=row['etc3'],
                            activation=row['activation'],
                            update_date='',
                            create_date=timezone.localtime())
            return render(request, 'epost/cvs_uploaded.html')
    else:
        form = CVSUploadForm()
    return render(request, 'epost/cvs_upload.html', {'form': form})
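# A minimal alternative sketch, not part of the original view: the get/except-ObjectDoesNotExist
# branch above can be collapsed with Django's QuerySet.update_or_create, keyed on the same
# (calling_plan, mobile_carrier) pair. The helper name is illustrative and the field list is
# abbreviated to a few of the columns used above.
from django.utils import timezone


def upsert_calling_plan(row):
    # Creates the record if the natural key is missing, otherwise updates it in place.
    plan, created = CallingPlan.objects.update_or_create(
        calling_plan=row['calling_plan'],
        mobile_carrier=row['mobile_carrier'],
        defaults={
            'company': row['company'],
            'brand': row['brand'],
            'homepage': row['homepage'],
            'category': row['category'],
            'pay': row['pay'],
            'activation': row['activation'],
            'update_date': timezone.localtime(),
        },
    )
    return plan, created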
Translation.query.delete()
TranslationExample.query.delete()
Chapter.query.delete()
Book.query.delete()
GrammaticalTerm.query.delete()
Grammar.query.delete()
GrammarExample.query.delete()

v_count = 0
t_count = 0
te_count = 0
g_count = 0
ge_count = 0

with open('csv/main.csv') as csv_file:
    csv_reader = csv.DictReader(csv_file, delimiter=',')
    for row in csv_reader:
        v_id = row['vocabulary.id']
        v_hanzi = row['vocabulary.hanzi']
        v_chapter_id = row['vocabulary.chapter_id']
        v_number_in_chapter = row['vocabulary.number_in_chapter']
        if v_id and v_hanzi and v_chapter_id and v_number_in_chapter:
            v_count += 1
            v = Vocabulary(id=v_id,
                           hanzi=v_hanzi,
                           chapter_id=v_chapter_id,
                           number_in_chapter=v_number_in_chapter)
            db.session.add(v)

        t_id = row['translation.id']
        t_vocabulary_id = row['translation.vocabulary_id']
        t_translation_en = row['translation.translation_en']
def loadparamsfromcsv(csvfilename, runs):
    """ Load and parse the csv file for the given set of runs and return nested dictionary:
    a collection of dictionaries, one for each csv row matching a run number.
    """
    import csv
    import os.path
    from sys import exit  # use sys.exit instead of built-in exit (latter raises exception)

    class CommentedFile:
        """ Decorator for text files: filters out comments (i.e. first char of line #)
        Based on http://www.mfasold.net/blog/2010/02/python-recipe-read-csvtsv-textfiles-and-ignore-comment-lines/
        """
        def __init__(self, f, commentstring="#"):
            self.f = f
            self.commentstring = commentstring
            self.linecount = 0

        def rewind(self):
            self.f.seek(0)
            self.linecount = 0

        def next(self):
            line = self.f.next()
            self.linecount += 1
            while line.startswith(self.commentstring) or not line.strip():  # test if line commented or empty
                line = self.f.next()
                self.linecount += 1
            return line

        def __iter__(self):
            return self

    log = logging.getLogger('jobsub')
    parameters_csv = {}  # store all information needed from the csv file
    if csvfilename is None:
        return parameters_csv  # if no file name given, return empty collection here
    if not os.path.isfile(csvfilename):  # check if file exists
        log.error("Could not find the specified csv file '"+csvfilename+"'!")
        exit(1)
    try:
        log.debug("Opening csv file '"+csvfilename+"'.")
        csvfile = open(csvfilename, 'rb')
        filteredfile = CommentedFile(csvfile)
        try:
            # construct a sample for the csv format sniffer:
            sample = ""
            try:
                while (len(sample) < 1024):
                    sample += filteredfile.next()
            except StopIteration:
                log.debug("End of csv file reached, sample limited to " + str(len(sample)) + " bytes")
            dialect = csv.Sniffer().sniff(sample)  # test csv file format details
            log.debug("Determined the CSV dialect as follows: delimiter=%s, doublequote=%s, escapechar=%s, lineterminator=%s, quotechar=%s , quoting=%s, skipinitialspace=%s",
                      dialect.delimiter, dialect.doublequote, dialect.escapechar,
                      list(ord(c) for c in dialect.lineterminator), dialect.quotechar,
                      dialect.quoting, dialect.skipinitialspace)
            filteredfile.rewind()  # back to beginning of file
            reader = csv.DictReader(filteredfile, dialect=dialect)  # now process CSV file contents here and load them into memory
            reader.next()  # python < 2.6 requires an actual read access before filling 'DictReader.fieldnames'
            log.debug("CSV file contains the header info: %s", reader.fieldnames)
            try:
                reader.fieldnames = [field.lower() for field in reader.fieldnames]  # convert to lower case keys to avoid confusion
                reader.fieldnames = [field.strip() for field in reader.fieldnames]  # remove leading and trailing white space
            except TypeError:
                log.error("Could not process the CSV file header information. csv.DictReader returned fieldnames: %s", reader.fieldnames)
                exit(1)
            if not "runnumber" in reader.fieldnames:  # verify that we have a column "runnumber"
                log.error("Could not find a column with header label 'RunNumber' in file '"+csvfilename+"'!")
                exit(1)
            if "" in reader.fieldnames:
                log.warning("Column without header label encountered in csv file '"+csvfilename+"'!")
            log.info("Successfully loaded csv file '"+csvfilename+"'.")
            # first: search through csv file to find corresponding runnumber entry line for every run
            filteredfile.rewind()  # back to beginning of file
            reader.next()  # .. and skip the header line
            missingRuns = list(runs)  # list of runs to look for in csv file
            for row in reader:  # loop over all rows once
                try:
                    for run in missingRuns:  # check all runs if runnumber matches
                        if int(row["runnumber"]) == run:
                            log.debug("Found entry in csv file for run "+str(run)+" on line "+str(filteredfile.linecount))
                            parameters_csv[run] = {}
                            parameters_csv[run].update(row)
                            missingRuns.remove(run)
                            break
                except ValueError:  # int conversion error
                    log.warn("Could not interpret run number on line "+str(filteredfile.linecount)+" in file '"+csvfilename+"'.")
                    continue
                if len(missingRuns) == 0:
                    log.debug("Finished search for runs in csv file before reaching end of file")
                    break
            log.debug("Searched over "+str(filteredfile.linecount)+" lines in file '"+csvfilename+"'.")
            if not len(missingRuns) == 0:
                log.error("Could not find an entry for the following run numbers in '"+csvfilename+"': "+', '.join(map(str, missingRuns)))
        finally:
            csvfile.close()
    except csv.Error, e:
        log.error("Problem loading the csv file '"+csvfilename+"'(%s): %s" % (e.errno, e.strerror))
        exit(1)
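# The wrapper above targets Python 2 (self.f.next(), 'rb' mode, reader.next()). Below is a
# hedged Python 3 sketch of the same idea, not part of the original jobsub code and with an
# illustrative function name: strip '#' comment lines and blank lines first, then hand the
# filtered text to csv.Sniffer and csv.DictReader.
import csv


def load_params_py3(csvfilename):
    with open(csvfilename, newline='') as f:
        # keep only non-empty, non-comment lines
        filtered = [line for line in f
                    if line.strip() and not line.lstrip().startswith('#')]
    sample = ''.join(filtered[:50])            # sample for the dialect sniffer
    dialect = csv.Sniffer().sniff(sample)
    reader = csv.DictReader(filtered, dialect=dialect)
    # normalise header labels the same way the Python 2 version does
    reader.fieldnames = [name.strip().lower() for name in reader.fieldnames]
    return {
        int(row['runnumber']): dict(row)
        for row in reader
        if (row.get('runnumber') or '').strip().isdigit()
    }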
def flatten_class(out_loc, zoo_file):
    with open(out_loc, 'w', newline='', encoding='utf-8') as file:
        fieldnames = [
            'subject_ids', 'filename', 'user_name', 'workflow_id',
            'workflow_version', 'classification_id', 'created_at',
            'fluke_bounding_boxes', 'fluke_tip_points', 'fluke_notch_points'
        ]
        writer = csv.DictWriter(file, fieldnames=fieldnames)
        writer.writeheader()

        # this area for initializing counters:
        i = 0
        j = 0

        with open(zoo_file, 'r', encoding='utf-8') as csvfile:
            csvreader = csv.DictReader(csvfile, delimiter=',', quotechar='"')
            for row in csvreader:
                i += 1
                # # useful for debugging - set the number of record to process at a low number ~1000
                # if i == 1000:
                #     break
                if include(row) is True:
                    j += 1
                    anns = json.loads(row['annotations'])
                    subj = json.loads(row['subject_data'])

                    # recover the subject filename from the subject-data
                    filename = ''
                    for k in subj:
                        if "filename" in subj[k]:
                            filename = subj[k]['filename']
                        elif "Filename" in subj[k]:
                            filename = subj[k]['Filename']
                        else:
                            print("No filename found")
                            print(subj)
                    filename = filename.lower()

                    fluke_bounding_boxes = []
                    fluke_tip_points = []
                    fluke_notch_points = []
                    for ann in anns:
                        try:
                            # pull out boxes
                            if ann['task'] == 'T1':
                                for drawing_object in ann['value']:
                                    if pull_rectangle(drawing_object):
                                        fluke_bounding_boxes.append(pull_rectangle(drawing_object))
                            # pull out tip points
                            if ann['task'] == 'T2':
                                for drawing_object in ann['value']:
                                    if pull_point(drawing_object):
                                        fluke_tip_points.append(pull_point(drawing_object))
                            # pull out notch points
                            if ann['task'] == 'T3':
                                for drawing_object in ann['value']:
                                    if pull_point(drawing_object):
                                        fluke_notch_points.append(pull_point(drawing_object))
                        except KeyError:
                            continue

                    writer.writerow({
                        'subject_ids': row['subject_ids'],
                        'filename': filename,
                        'user_name': row['user_name'],
                        'workflow_id': row['workflow_id'],
                        'workflow_version': row['workflow_version'],
                        'classification_id': row['classification_id'],
                        'created_at': row['created_at'],
                        'fluke_bounding_boxes': json.dumps(fluke_bounding_boxes),
                        'fluke_tip_points': json.dumps(fluke_tip_points),
                        'fluke_notch_points': json.dumps(fluke_notch_points)
                    })

                if i % 10000 == 0:
                    print('flatten', i, j)
    return str(i) + ' Lines read and ' + str(j) + ' records processed'
def aggregate(sorted_loc, aggregated_loc):
    with open(aggregated_loc, 'w', newline='', encoding='utf-8') as file:
        fieldnames = [
            'subject_ids', 'filename', 'classifications', 'boxes', 'box_clusters',
            'bclusters', 'tips', 'tip_clusters', 'tclusters', 'notches',
            'notch_clusters', 'nclusters', 'flukes'
        ]
        writer = csv.DictWriter(file, fieldnames=fieldnames)
        writer.writeheader()

        # set up to read the flattened file
        with open(sorted_loc, 'r', encoding='utf-8') as f:
            r = csv.DictReader(f)

            # initialize a starting point subject and empty bins for aggregation
            subject = ''
            users = ''
            filename = ''
            i = 1
            j = 0
            boxes = []
            tips = []
            notches = []

            # Loop over the flattened classification records
            for row in r:
                j += 1
                if j % 10000 == 0:
                    print('aggregating', j)

                # read a row and pull out the flattened data fields we need to aggregate, or output.
                new_subject = row['subject_ids']
                new_filename = row['filename']
                new_user = row['user_name']
                row_boxes = json.loads(row['fluke_bounding_boxes'])
                row_tips = json.loads(row['fluke_tip_points'])
                row_notches = json.loads(row['fluke_notch_points'])

                # test for change in selector - output on change
                if new_subject != subject:
                    if i != 1:
                        # if not the first line analyse the aggregated fields and output the results
                        new_row = process_aggregation(subject, filename, i, boxes, tips, notches)
                        writer.writerow(new_row)
                    # reset the selector, those things we need to output and the bins for the aggregation.
                    i = 1
                    subject = new_subject
                    filename = new_filename
                    users = {new_user}
                    boxes = row_boxes
                    tips = row_tips
                    notches = row_notches
                else:
                    # do the aggregation - clean for excess classifications and multiple
                    # classifications by the same user on this subject
                    if users != users | {new_user}:
                        users |= {new_user}
                        boxes.extend(row_boxes)
                        tips.extend(row_tips)
                        notches.extend(row_notches)
                        i += 1

            # catch and process the last aggregated group
            new_row = process_aggregation(subject, filename, i, boxes, tips, notches)
            writer.writerow(new_row)
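# A hedged alternative sketch, not the original implementation: because the flattened file is
# already grouped by subject_ids when aggregate() runs, the manual "output on change of
# selector" bookkeeping above can also be written with itertools.groupby. The function name is
# illustrative; process_aggregation and the column names are the ones used above, and the
# writer is assumed to be the csv.DictWriter set up as in aggregate().
import csv
import json
from itertools import groupby


def aggregate_with_groupby(sorted_loc, writer):
    with open(sorted_loc, 'r', encoding='utf-8') as f:
        reader = csv.DictReader(f)
        for subject, rows in groupby(reader, key=lambda r: r['subject_ids']):
            rows = list(rows)
            filename = rows[0]['filename']
            users = set()
            boxes, tips, notches = [], [], []
            count = 0
            for row in rows:
                if row['user_name'] in users:
                    continue  # keep only one classification per user per subject
                users.add(row['user_name'])
                boxes.extend(json.loads(row['fluke_bounding_boxes']))
                tips.extend(json.loads(row['fluke_tip_points']))
                notches.extend(json.loads(row['fluke_notch_points']))
                count += 1
            writer.writerow(process_aggregation(subject, filename, count, boxes, tips, notches))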
                  '\n')
    if retry.lower() != 'y':
        quit()
else:
    flatten_class(out_location, zooniverse_file)
    sort_file(out_location, sorted_location, 0, False, True)
    aggregate(sorted_location, aggregated_location)

# crawl the image directory and acquire the filenames
imageFilenames, imageFilenameMap = get_filenames(fluke_images_dir)

# load the aggregated WAI data and proceed to loop over the valid flukes. Load the matching image if any and
# rotate and crop the image and save the cropped image.
with open(aggregated_location, 'r', encoding='utf-8') as ag_file:
    images_not_processed = []
    r_ag = csv.DictReader(ag_file)
    for line in r_ag:
        fluke_positons = json.loads(line['flukes'])
        image = line['filename']
        if image not in imageFilenames:
            continue
        else:
            # a match has been found with one of the current images being analysed.
            realFilename = imageFilenameMap[image]
            # Read the image
            imageData = cv.imread(fluke_images_dir + os.sep + realFilename)
            width, height = imageData.shape[1], imageData.shape[0]
            counter = 0
            if len(fluke_positons) < 5:  # the invalid "something weird" fluke positions fail this test
def load(self, csv_file_name):
    raw_XX = []  # 3D list (2nd dim is mutable)
    raw_Y = []   # 2D list (2nd dim is mutable)
    raw_AA = []
    raw_MM = []
    with open(csv_file_name) as csv_file:
        reader = csv.DictReader(csv_file, delimiter=';')
        past_name = None
        X = []
        y = []
        A = []
        M = []
        for row in reader:
            # Each row corresponds to a frame (bar)
            # Using 'filename_sv' to determine song boundaries
            if past_name != row['filename_sv']:
                if X:
                    raw_XX.append(X)
                if y:
                    raw_Y.append(y)
                if A:
                    raw_AA.append(A)
                if M:
                    raw_MM.append(M)
                X = []
                y = []
                A = []
                M = []
                past_name = row['filename_sv']
            # Get rid of songs with no key
            if not row['key']:
                continue
            # Note: mode not currently used
            key, mode = self._process_key(row['key'])
            self.keys.append(key)
            X_i = self._process_Xi(row['tpc_hist_counts'])
            y_i = self._process_yi(row['chords_raw'], row['chord_types_raw'], key)
            A_i = self._process_Ai(row['tpc_raw'])
            M_i = self._process_Mi(row['metrical_weight'])
            # get rid of bars with no chords
            if not y_i:
                continue
            X.append(X_i)
            y.append(y_i)
            A.append(A_i)
            M.append(M_i)
        if X:
            raw_XX.append(X)
        if y:
            raw_Y.append(y)
        if A:
            raw_AA.append(A)
        if M:
            raw_MM.append(M)
    self.XX = self._process_XX(raw_XX)
    self.Y = self._process_Y(raw_Y)
    self.AA = self._process_AA(raw_AA)
    self.MM = self._process_MM(raw_MM)
ontologyMap = {}
with open(args.ontology, 'rt') as fin:
    for line in fin:
        if line.startswith("[Term]"):
            clId = ""
            clName = ""
        elif line.startswith("id: CL:"):
            clId = line[7:].rstrip()
        elif line.startswith("name: ") and clId != "":
            clName = line[6:].rstrip()
            ontologyMap["CL_" + clId] = clName

ontologyGeneMap = {}
with open(args.input) as tsvfile:
    reader = csv.DictReader(tsvfile, dialect='excel-tab')
    with open(args.output, 'wt') as fout:
        for row in reader:
            olo = row["CellOntologyID"]
            if olo == "NA":
                continue
            if olo not in ontologyMap:
                print("Cannot find ontology " + olo)
                continue
            genes = row["geneSymbol"]
            if not olo in ontologyGeneMap:
                ontologyGeneMap[olo] = {}
def read_data(data_file, types_file, miss_file, true_miss_file):
    # Read types of data from data file
    with open(types_file) as f:
        types_dict = [{k: v for k, v in row.items()}
                      for row in csv.DictReader(f, skipinitialspace=True)]

    # Read data from input file
    with open(data_file, 'r') as f:
        data = [[float(x) for x in rec] for rec in csv.reader(f, delimiter=',')]
        data = np.array(data)

    # Substitute NaN values by something (we assume we have the real missing value mask)
    if true_miss_file:
        with open(true_miss_file, 'r') as f:
            missing_positions = [[int(x) for x in rec] for rec in csv.reader(f, delimiter=',')]
            missing_positions = np.array(missing_positions)

        true_miss_mask = np.ones([np.shape(data)[0], len(types_dict)])
        true_miss_mask[missing_positions[:, 0] - 1, missing_positions[:, 1] - 1] = 0  # Indexes in the csv start at 1
        data_masked = np.ma.masked_where(np.isnan(data), data)
        # We need to fill the data depending on the given data...
        data_filler = []
        for i in range(len(types_dict)):
            if types_dict[i]['type'] == 'cat' or types_dict[i]['type'] == 'ordinal':
                aux = np.unique(data[:, i])
                data_filler.append(aux[0])  # Fill with the first element of the cat (0, 1, or whatever)
            else:
                data_filler.append(0.0)
        data = data_masked.filled(data_filler)
    else:
        true_miss_mask = np.ones([np.shape(data)[0], len(types_dict)])  # It doesn't affect our data

    # Construct the data matrices
    data_complete = []
    for i in range(np.shape(data)[1]):
        if types_dict[i]['type'] == 'cat':
            # Get categories
            cat_data = [int(x) for x in data[:, i]]
            categories, indexes = np.unique(cat_data, return_inverse=True)
            # Transform categories to a vector of 0:n_categories
            new_categories = np.arange(int(types_dict[i]['dim']))
            cat_data = new_categories[indexes]
            # Create one hot encoding for the categories
            aux = np.zeros([np.shape(data)[0], len(new_categories)])
            aux[np.arange(np.shape(data)[0]), cat_data] = 1
            data_complete.append(aux)
        elif types_dict[i]['type'] == 'ordinal':
            # Get categories
            cat_data = [int(x) for x in data[:, i]]
            categories, indexes = np.unique(cat_data, return_inverse=True)
            # Transform categories to a vector of 0:n_categories
            new_categories = np.arange(int(types_dict[i]['dim']))
            cat_data = new_categories[indexes]
            # Create thermometer encoding for the categories
            aux = np.zeros([np.shape(data)[0], 1 + len(new_categories)])
            aux[:, 0] = 1
            aux[np.arange(np.shape(data)[0]), 1 + cat_data] = -1
            aux = np.cumsum(aux, 1)
            data_complete.append(aux[:, :-1])
        else:
            data_complete.append(np.transpose([data[:, i]]))

    data = np.concatenate(data_complete, 1)

    # Read Missing mask from csv (contains positions of missing values)
    n_samples = np.shape(data)[0]
    n_variables = len(types_dict)
    miss_mask = np.ones([np.shape(data)[0], n_variables])
    # If there is no mask, assume all data is observed
    if os.path.isfile(miss_file):
        with open(miss_file, 'r') as f:
            missing_positions = [[int(x) for x in rec] for rec in csv.reader(f, delimiter=',')]
            missing_positions = np.array(missing_positions)
        miss_mask[missing_positions[:, 0] - 1, missing_positions[:, 1] - 1] = 0  # Indexes in the csv start at 1

    return data, types_dict, miss_mask, true_miss_mask, n_samples
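# Illustration only (toy values, not from the dataset): what the 'ordinal' thermometer encoding
# above produces for a column with dim = 4. Category k maps to k + 1 leading ones followed by zeros,
# built exactly as in read_data(): a cumulative sum over a row that starts with 1 and carries a -1
# just past the category index.
import numpy as np

cat_data = np.array([0, 2, 3])            # already remapped to 0..n_categories-1
dim = 4
aux = np.zeros([len(cat_data), 1 + dim])
aux[:, 0] = 1
aux[np.arange(len(cat_data)), 1 + cat_data] = -1
thermometer = np.cumsum(aux, 1)[:, :-1]
# thermometer ->
# [[1. 0. 0. 0.]
#  [1. 1. 1. 0.]
#  [1. 1. 1. 1.]]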
def read_and_create(container, data, mappings, object_type,
                    create_new=False, primary_key='id', counts={}):
    new_count = 0
    existing_count = 0
    ignore_count = 0
    report = []

    # TODO(ivanteoh): Make sure the object has all the valid keys
    # keys = resources[0].keys()
    # hasProperty, getProperty not working
    catalog = api.portal.get_tool(name="portal_catalog")
    container_path = "/".join(container.getPhysicalPath())

    # TODO(ivanteoh): Make sure container is either folder or SiteRoot
    reader = csv.DictReader(data.splitlines(), delimiter=",",
                            dialect="excel", quotechar='"')

    # use IURLNormalizer instead of IIDNormalizer for url id
    normalizer = getUtility(IURLNormalizer)

    # return only fields that are needed.
    for row in reader:
        ## set primary_key
        #if primary_key not in row:
        #    continue
        #key_value = row[primary_key].decode("utf-8")
        ## http://docs.plone.org/develop/plone/misc/normalizing_ids.html
        ## Normalizers to safe ids
        #fields[KEY_ID] = normalizer.normalize(key_value)

        key_arg = {}
        for key, value in row.items():
            if not key:
                continue
            if key in mappings:
                key_arg[mappings[key].decode("utf-8")] = value.decode("utf-8")

        # find existing obj
        obj = None
        if primary_key and primary_key not in key_arg:
            obj = None
            # in this case we shouldn't create or update it
            ignore_count += 1
            continue
        if primary_key in ['_path', 'id', '_url']:
            if primary_key == '_url':
                path = '/'.join(getRequest().physicalPathFromURL(key_arg[primary_key]))
                if not path.startswith(container_path):
                    ignore_count += 1
                    continue
                path = path[len(container_path):].lstrip('/')
            else:
                path = key_arg[primary_key].encode().lstrip('/')
            obj = container.restrictedTraverse(path, None)
            if obj is None:
                # special case because id gets normalised.
                # try and guess the normalised id
                if primary_key == 'id':
                    # just in case id has '/' in it
                    path = normalizer.normalize(key_arg[primary_key].encode())
                else:
                    path = path.rsplit('/', 1)
                    path[-1] = normalizer.normalize(path[-1])
                    path = '/'.join(path)
                obj = container.restrictedTraverse(path, None)
            if 'id' not in key_arg:
                # ensure we don't use title
                key_arg['id'] = path.split('/')[-1]
            if obj is not None:
                existing_count += 1
        elif primary_key and primary_key in key_arg:
            # TODO: this is wrong since indexes aren't always the same as fields
            # Should check if there is an index, else back down to find util
            query = dict(path={"query": container_path, "depth": 1},
                         # portal_type=object_type,
                         )
            query[primary_key] = key_arg[primary_key]
            results = catalog(**query)
            if len(results) > 1:
                assert "Primary key must be unique"
                ignore_count += 1
                continue
            elif len(results) == 1:
                obj = results[0].getObject()
                existing_count += 1

        if obj is None and create_new:
            # TODO: handle creating using passed in path, ie find/create folders
            # Save the objects in this container
            # TODO: validate we either have an id or title (or make random ids)
            # TODO: currently lets you create files without a required file field,
            # which breaks on view
            obj = api.content.create(
                type=object_type,
                container=container,
                safe_id=True,
                **{key: key_arg[key] for key in ['id', 'title'] if key in key_arg}
            )
            new_count += 1
        elif obj is None:
            ignore_count += 1
            continue

        #if not checkPermission("zope.Modify", obj):
        #    ignore_count += 1
        #    continue

        key_arg['_path'] = '/'.join(obj.getPhysicalPath())[len(container_path) + 1:]
        if 'id' in key_arg:
            del key_arg['id']  # otherwise transmogrifier renames it

        yield key_arg

        # TODO(ivanteoh): any performance risk by calling this?
        # TODO: only do this if we changed something
        notify(ObjectModifiedEvent(obj))

        # TODO: need to implement stop feature
        assert obj.id

        # generate report for csv export
        # key_arg[u"id"] = obj.id
        # key_arg[u'path'] = obj.absolute_url()
        # report.append(obj)

        # Later if want to rename
        # api.content.rename(obj=portal["blog"], new_id="old-blog")

    counts.update(
        {"existing_count": existing_count,
         "new_count": new_count,
         "ignore_count": ignore_count,
         "report": report})
# Homework #3: functions
# Deadline: November 4, 18:14
# Send the result to [email protected]
# Also read the "Functions" chapter of "A Byte of Python" (p. 59)

# Task: analyse the age distribution of a group of students, using functions.
# Remember that a) some students have no age data, and b) the age may be given as a range,
# e.g. 25-35. So do not forget to handle errors and exceptions!

import csv

# remember: this time we read a list of dicts, not a list of lists!
# the keys in each student's dict come from the first line of student_ages.csv:
# "Номер в списке" (number in list), "Возраст" (age)
ages_list = list()
with open('/Users/andreymakarov/Downloads/mai_python_2019/03 Functions/ages.csv', encoding="utf-8") as csvfile:
    ages_dictreader = csv.DictReader(csvfile, delimiter=',')
    ages_list = list(ages_dictreader)
#print(ages_list)

# hint: this is how we can get data out of a list of dicts,
# exactly as we already did in the lecture code with the apartments
for al in ages_list:
    print(f'"Номер в списке": {al["Номер в списке"]}, "Возраст": {al["Возраст"]}')
print()

# Task 1: write a function that splits the student sample into two parts:
# at most the given age, and older than the given age.
# It should return "number in list, age" pairs.
print("ПЕРВАЯ ФУНКЦИЯ")  # "FIRST FUNCTION"
print()
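# One possible sketch for Task 1 (an illustration, not the official solution): split the list
# of student dicts into "at most the given age" and "older than the given age". Missing ages
# are skipped; a range such as "25-35" is reduced to its lower bound, which is a choice made
# here, not something the assignment text prescribes.
def split_by_age(students, max_age):
    younger_or_equal = []
    older = []
    for student in students:
        raw_age = (student.get("Возраст") or "").strip()
        if not raw_age:
            continue  # no age recorded for this student
        try:
            age = int(raw_age.split("-")[0])
        except ValueError:
            continue  # age field is neither a number nor a "low-high" range
        pair = (student["Номер в списке"], raw_age)
        if age <= max_age:
            younger_or_equal.append(pair)
        else:
            older.append(pair)
    return younger_or_equal, older

# example call: split_by_age(ages_list, 25)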
from functions import getTBAdata
import csv
from scipy.stats.mstats import gmean
from numpy import std

dps = {}
with open("DistrictRankings/YearlyPredictor/data_award.csv") as file:
    reader = csv.DictReader(file)
    for row in reader:
        dps[row["Team"]] = float(row["Avg"])

scores = []
events = getTBAdata("events/2019/keys")
for event in events:
    print(event)
    teams = getTBAdata("event/" + event + "/teams/keys")
    if len(teams) < 2:
        continue
    localdps = [dps[t] if t in dps and dps[t] != 0 else 1 for t in teams]
    scores.append((event, gmean(localdps), std(localdps)))

scores = sorted(scores, key=lambda x: x[1] / x[2], reverse=True)

with open("DistrictRankings/YearlyPredictor/ranked_events.csv", "w+") as file:
    file.write("Event,Mean,StDev,Score\n")
    for e in scores:
        file.write(e[0] + "," + str(e[1]) + "," + str(e[2]) + "," + str(e[1] / e[2]) + "\n")
def get_seq_with_max_average_blast_score_MDebug(taxon_fasta_filename, taxon_blast_filename):
    seqids_to_seq = get_relevant_seqids(taxon_fasta_filename)
    logger.debug("Generating dictionary of bitscores between seqs according to %s" % taxon_blast_filename)
    with open(taxon_blast_filename) as f:
        dr = csv.DictReader(f, delimiter='\t', fieldnames=[
            'query_id', 'subject_id', 'pct_identity', 'align_len', 'mismatches',
            'gap_openings', 'q_start', 'q_end', 's_start', 's_end', 'eval', 'bitscore'
        ])
        max_score_dict = dict()
        all_seq_ids = []
        for row in dr:
            row_key = get_row_key(row)
            query_id = row['query_id']
            sub_id = row['subject_id']
            if query_id not in all_seq_ids:
                all_seq_ids.append(query_id)
            if sub_id not in all_seq_ids:
                all_seq_ids.append(sub_id)
            #logger.debug("Adding the following key %s" % row_key)
            if row_key not in max_score_dict:
                max_score_dict[row_key] = -1.0
            score = float(row['bitscore'])
            if max_score_dict[row_key] < score:
                max_score_dict[row_key] = score

    seqid_to_average_score = dict()
    missing_keys = list()
    #logger.debug("MDebug : max_score_dict\n")
    #logger.debug(max_score_dict)
    for seqid in seqids_to_seq:
        average_bit_score = 0.0
        for other_seqid in all_seq_ids:
            if seqid != other_seqid:
                key = get_taxon_gis_key(seqid, other_seqid)
                if key not in max_score_dict:
                    missing_keys.append(key)
                else:
                    average_bit_score = average_bit_score + max_score_dict[key]
        average_bit_score /= len(seqids_to_seq) - 1
        seqid_to_average_score[seqid] = average_bit_score

    if len(missing_keys) > 0:
        logger.error("Didn't find the following keys in blast file %s: %s" %
                     (taxon_blast_filename, ",".join(missing_keys)))

    max_seqid = None
    max_average_bitscore = -1
    for seqid in seqid_to_average_score:
        # second check is done in order to make sure this method will always return the same seqid
        # in case there are several seqs with the same average_bitscore
        if (max_average_bitscore < seqid_to_average_score[seqid]) or (
                max_average_bitscore == seqid_to_average_score[seqid] and seqid > max_seqid):
            max_average_bitscore = seqid_to_average_score[seqid]
            max_seqid = seqid

    logger.info("Max average bitscore is %f for %s .Found the following average bit scores per GI %s" %
                (max_average_bitscore, max_seqid, seqid_to_average_score))
    return seqids_to_seq[max_seqid]
try:
    input_file = sys.argv[1]
    start_date = sys.argv[2]
    csv_column = sys.argv[3]
    output_path = os.path.dirname(input_file)
    current_date = datetime.date.fromisoformat(start_date)
    cases_relation = {}
    counter = 0
    cases_per_day = 0
    check_constraint = 0
    deaths_per_day = 0
    with open(input_file, newline='') as f:
        reader = csv.DictReader(f)
        for row in reader:
            counter += 1
            print("Checking case #{0}".format(counter), end='\r')
            if (row['FECHA_INGRESO'] == "{0:%d}/{0:%m}/{0:%y}".format(current_date)):
                if (row['RESULTADO'] == '1'):
                    cases_per_day += 1
                    # A number one means that the constraint asserts to true
                    if (row[csv_column] == '1'):
                        check_constraint += 1
                    if (row['FECHA_DEF'] != '9999-99-99'):
                        deaths_per_day += 1
            else:
                print("\nChecked {0}".format(row['FECHA_INGRESO']))
import csv

db = []
with open("libri.csv", newline="") as dbfile:
    dbreader = csv.DictReader(dbfile, delimiter=";")
    for row in dbreader:
        db.append(dict(row))


# Exercise 1
def cle_date(d):
    return d["date"]

def ex1():
    return sorted(db, key=cle_date)


# Exercise 2
def ex2():
    genres = []
    for entry in db:
        if (entry["genre"] not in genres):
            genres.append(entry["genre"])
    return genres


# Exercise 3
def ex3():
    titres = []
    for entry in db:
        if (int(entry["date"]) < 1820):
            titres.append(entry["titre"])
    return titres
con = sqlite3.connect("safety_harbor.db")
cur = con.cursor()

# drop tables if they exist so we do not insert repeat data
for tablename in table_list:
    stmt = "DROP TABLE IF EXISTS " + tablename
    cur.execute(stmt)
    con.commit()

# create nodes table
cur.execute("CREATE TABLE IF NOT EXISTS nodes (id, lat, lon, user, uid, version, changeset, timestamp);")

# load table
with codecs.open('nodes.csv', encoding='utf-8-sig') as fin:
    dr = csv.DictReader(fin)
    pprint.pprint(dr.fieldnames)
    to_db = [(i['id'], i['lat'], i['lon'], i['user'], i['uid'], i['version'],
              i['changeset'], i['timestamp']) for i in dr]

cur.executemany("INSERT INTO nodes (id, lat, lon, user, uid, version, changeset, timestamp) \
    VALUES (?, ?, ?, ?, ?, ?, ?, ?);", to_db)

# create nodes_tags table
cur.execute("CREATE TABLE IF NOT EXISTS nodes_tags (id, key, value, type);")

# load table
with codecs.open('nodes_tags.csv', encoding='utf-8-sig') as fin:
    dr = csv.DictReader(fin)
    pprint.pprint(dr.fieldnames)
    to_db = [(i['id'], i['key'], i['value'], i['type']) for i in dr]
cmdout = os.system('zcat < '+work_dir+'threatmetrix_payer_'+str(day)+'.csv.gz | head -1 > '+work_dir+'threatmetrix_payer_'+str(day)+'_sorted.csv')
cmdout = os.system('zcat < '+work_dir+'threatmetrix_payer_'+str(day)+'.csv.gz | sed 1d | LC_ALL=C sort -t, -k1,1 >> '+work_dir+'threatmetrix_payer_'+str(day)+'_sorted.csv')
cmdout = os.system('gzip '+work_dir+'threatmetrix_payer_'+str(day)+'_sorted.csv')

header_out = ['payment_request_id'] + signal_names
output_file = work_dir+"threatmetrix_payer_flat_"+str(day)+".csv.gz"
outfile = gzip.open(output_file, 'w')
outcsv = csv.DictWriter(outfile, fieldnames=header_out)
outcsv.writeheader()

input_file = work_dir+"threatmetrix_payer_"+str(day)+"_sorted.csv.gz"
infile = gzip.open(input_file, 'rb')
incsv = csv.DictReader(infile)

row_flat = {}
payment_request_id = ''
nRow = 0
nPayment = 0
for row in incsv:
    if not is_number(row['payment_request_id']) or row['payment_request_id'] == '':
        print "key not valid: ", row['payment_request_id']
        continue
    # tell if we reach a new payment
    if row['payment_request_id'] != payment_request_id:
        # output last row if not first time payment_request_id
        if nRow != 0:
            #print row_flat
            outcsv.writerow(row_flat)
import argparse
import csv

from flask import Flask

app = Flask(__name__)

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("--host", type=str)
    parser.add_argument("--port", type=int)
    parser.add_argument("--file", type=str)
    args = parser.parse_args()

    response = {}
    with open(args.file, encoding="utf8") as csvfile:
        reader = csv.DictReader(csvfile, delimiter=';', quotechar='"')
        for creature in reader:
            specie = creature["creature_type"]
            if specie not in response:
                response[specie] = {
                    "habitats": set(),
                    "colors": set(),
                    "heights": set(),
                    "feeds": set()
                }
            response[specie]["habitats"].add(creature["habitat"])
            response[specie]["colors"].add(
def read_from_csv_dict():
    with open('data.csv', 'rt') as f:
        csvin = csv.DictReader(f, fieldnames=['first', 'last'])
        for row in csvin:
            print row
def __initialize_clients_from_storage():
    with open(CLIENT_TABLE, mode='r') as f:
        reader = csv.DictReader(f, fieldnames=CLIENTS_SCHEMA)
        for row in reader:
            clients.append(row)
def readcsv(fname):
    """Reads the CSV file given and returns a list of dicts"""
    import csv
    reader = csv.DictReader(open(fname))
    ret = [row for row in reader]
    return ret
def vqa_v2_generator(self, data_dir, tmp_dir, datasets):
  """VQA v2 generator using image features."""
  _get_vqa_v2_annotations(tmp_dir, self._VQA_V2_ANNOTATION_URL)
  _get_vqa_v2_image_feature_dataset(tmp_dir, self._VQA_V2_FEATURE_URL)

  vocab_path = os.path.join(data_dir, self.vocab_filename)
  if not tf.gfile.Exists(vocab_path):
    vocab_tmp_path = os.path.join(tmp_dir, self.vocab_filename)
    tf.gfile.Copy(vocab_tmp_path, vocab_path)
    with tf.gfile.GFile(vocab_path, mode="r") as f:
      vocab_data = "<pad>\n<EOS>\n" + f.read() + "UNK\n"
    with tf.gfile.GFile(vocab_path, mode="w") as f:
      f.write(vocab_data)

  label_path = os.path.join(data_dir, self.label_filename)
  if not tf.gfile.Exists(label_path):
    label_tmp_path = os.path.join(tmp_dir, self.label_filename)
    tf.gfile.Copy(label_tmp_path, label_path)

  vocab_encoder = text_encoder.TokenTextEncoder(vocab_path, replace_oov="UNK")
  label_encoder = text_encoder.ClassLabelEncoder(class_labels_fname=label_path)

  # merge annotations
  annotation_json = []
  for _, annotation_file in datasets:
    annotation_path = os.path.join(tmp_dir, annotation_file)
    with tf.gfile.Open(annotation_path) as f:
      annotation_json += json.loads(f.read())
  annotation_count = len(annotation_json)
  tf.logging.info("Processing %d annotations for vqa v2" % (annotation_count))

  imageid2annotation = {}
  for anno in annotation_json:
    if anno["image_id"] not in imageid2annotation:
      imageid2annotation[anno["image_id"]] = [anno]
    else:
      imageid2annotation[anno["image_id"]].append(anno)

  csv.field_size_limit(sys.maxsize)
  for feature_file, _ in datasets:
    feature_file_path = os.path.join(tmp_dir, feature_file)
    with open(feature_file_path, "r+b") as tsv_file:
      csv_reader = csv.DictReader(
          tsv_file, delimiter="\t", fieldnames=self.feature_file_field_names)
      for item in csv_reader:
        item["num_boxes"] = int(item["num_boxes"])
        image_id = int(item["image_id"])
        image_w = float(item["image_w"])
        image_h = float(item["image_h"])
        bboxes = np.frombuffer(base64.decodestring(item["boxes"]),
                               dtype=np.float32).reshape((item["num_boxes"], -1))

        box_width = bboxes[:, 2] - bboxes[:, 0]
        box_height = bboxes[:, 3] - bboxes[:, 1]
        scaled_width = box_width / image_w
        scaled_height = box_height / image_h
        scaled_x = bboxes[:, 0] / image_w
        scaled_y = bboxes[:, 1] / image_h
        box_width = box_width[..., np.newaxis]
        box_height = box_height[..., np.newaxis]
        scaled_width = scaled_width[..., np.newaxis]
        scaled_height = scaled_height[..., np.newaxis]
        scaled_x = scaled_x[..., np.newaxis]
        scaled_y = scaled_y[..., np.newaxis]

        spatial_features = np.concatenate(
            (scaled_x, scaled_y, scaled_x + scaled_width,
             scaled_y + scaled_height, scaled_width, scaled_height),
            axis=1)

        if image_id in imageid2annotation:
          for anno in imageid2annotation[image_id]:
            question = vocab_encoder.encode(anno["question"])
            answer = [label_encoder.encode(ans) for ans in anno["answer"]]
            answer = answer if answer else [0]  # 0 indicates padding
            yield {
                "image/feature":
                    np.frombuffer(base64.decodestring(item["features"]),
                                  dtype=np.float32).tolist(),
                "image/spatial_feature": spatial_features.flatten().tolist(),
                "image/height": [image_h],
                "image/width": [image_w],
                "image/bboxes": bboxes.flatten().tolist(),
                "image/image_id": [image_id],
                "image/question_id": [anno["question_id"]],
                "image/question": question,
                "image/answer": answer,
            }

          del imageid2annotation[image_id]

  # assert all annotations are included
  assert not imageid2annotation