import multiprocessing
import traceback


def store_lines(kw):
    process_name = multiprocessing.current_process().name
    db_index = kw.get('job_index', 0)
    pipe = writer_function
    options = dict(
        max_count=kw.get('max_count', 100000),
        iter_line=caller_function,
        as_set=True,
        pipe=pipe,
        row_index=kw.get('row_index', 1),
        keep_sample=kw.get('sample', False),
        byte_start=kw.get('byte_start', None),
        byte_end=kw.get('byte_end', None),
    )
    try:
        d, sample = parse_csv(**options)
        print('Finished relations. Count: {}. Writing file'.format(len(d)))
    except Exception as e:
        print('Error on "{}":'.format(process_name), e)
        print(traceback.format_exc())
        d = []
    # pipe.execute()
    return list(d)
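# A minimal sketch (not part of the original file) of how store_lines could be
# fanned out over byte ranges of one input file with multiprocessing.Pool.
# byte_ranges and run_parallel are hypothetical helpers; the real driver, and
# the writer_function/caller_function globals used above, are not shown here.
import os


def byte_ranges(path, workers):
    """Split a file into roughly equal (byte_start, byte_end) ranges."""
    size = os.path.getsize(path)
    step = size // workers
    return [(i * step, size if i == workers - 1 else (i + 1) * step)
            for i in range(workers)]


def run_parallel(path, workers=4):
    jobs = [{'job_index': i, 'byte_start': start, 'byte_end': end}
            for i, (start, end) in enumerate(byte_ranges(path, workers))]
    with multiprocessing.Pool(workers) as pool:
        return pool.map(store_lines, jobs)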
def summarize(filename):
    orig, parsed = list(parse.parse_csv(filename))
    hydro = list(row.mw_available.hydro for row in parsed)
    needed = list(row.mw_available.total for row in parsed)
    # Amount of additional power we need to generate beyond hydro.
    shortfall = list(n - h for n, h in zip(needed, hydro))
    print_summary_stats('needed power', needed)
    print_summary_stats('hydro', hydro)
    print_summary_stats('hydro shortfall', shortfall)
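# print_summary_stats is not defined in this snippet; a minimal sketch of what
# it is assumed to report, using only the standard library:
import statistics


def print_summary_stats(label, values):
    print('{}: min={:.1f} mean={:.1f} max={:.1f}'.format(
        label, min(values), statistics.mean(values), max(values)))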
def main():
    users = parse_csv("anonwhipdata.csv")
    jitter = Jitter(users)
    jitter.plot_nom_rate("type")
    jitter.plot_nom_rate("insurance_group", insurance_group)
    jitter.plot_nom_rate("doors")
    jitter.plot_nom_rate("seats")
    jitter.plot_nom_rate(
        "fuel",
        lambda value: value == "heavy oil" and "diesel" or value)
    jitter.plot_nom_rate(
        "transmission",
        lambda value: value and "automatic" in value and "automatic" or value or "None")
    jitter.plot_nom_rate("engine_cc", int_slop(slop=500, top=10000))
    jitter.plot_nom_rate("engine_co2", int_slop(slop=30))
    jitter.plot_nom_rate("has_photos")
    jitter.plot_nom_rate("reg_date", date_to_age)
    jitter.plot_nom_rate("join_date", date_to_age_small)
    jitter.plot_nom_rate("engine_cc", int_slop(slop=200, top=10000),
                         filters=[Filter("car_type", ["car.hatchback"])])
    jitter.plot_nom_rate("engine_cc", int_slop(slop=200, top=10000),
                         filters=[Filter("insurance_group", [10, 3])])
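# int_slop is used above but not defined in this snippet. From the call sites
# it appears to return a bucketing function for noisy integer fields; this
# sketch (rounding down to a multiple of slop, capping at top) is an
# assumption, not the original implementation.
def int_slop(slop, top=None):
    def bucket(value):
        value = int(value)
        if top is not None and value > top:
            value = top
        return (value // slop) * slop
    return bucket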
def main():
    (init, hours) = parse.parse_csv(open(sys.argv[1], 'r'))
    writer = csv.writer(open(sys.argv[2], "w"))
    season = config.get_season(hours[0])
    nuclear = init[-1].mw_drawn.nuclear

    def target_nuclear(inrow):
        # This is a control system that decides how much nuclear power we
        # want. With the provided numbers, hydro is optimal, but nuclear is
        # second-best and nuclear supply is inelastic. So we want to use lots
        # of nuclear, but only after we use as much hydro as possible.

        # Scale the historical draw estimates by current availability; the
        # factor is clamped at 1, so estimates are never adjusted down.
        adjust_factor = max(1, inrow.mw_available.total / inrow.historical_drawn[0])
        predicted_drawn = [drawn * adjust_factor for drawn in inrow.historical_drawn]
        avg_draw = sum(predicted_drawn) / len(predicted_drawn)
        # Hydro appears to be pretty stable.
        predicted_needed = max(0, avg_draw - inrow.mw_available.hydro)
        print('Aiming for {} nuclear power'.format(predicted_needed))
        return predicted_needed

    value_func = {'cost': -1, 'co2': -2, 'green': 0}
    if len(sys.argv) > 3:
        value_func = {
            'cost': float(sys.argv[3]),
            'co2': float(sys.argv[4]),
            'green': float(sys.argv[5]),
        }
    outrows = []
    for hour in hours:
        rate = config.consumer_rate(season, hour.time)
        # Nuclear output may only move 1% per hour in either direction.
        nuclear = clamp(target_nuclear(hour), nuclear * 0.99, nuclear * 1.01)
        power_row, sold = optimizer.optimize(hour, nuclear, value_func)
        outrow = gen_outrow(hour, power_row, sold, rate)
        writer.writerow(outrow.to_row())
        outrows.append(outrow)
    print_summary(outrows)
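# clamp is used above but not shown; a one-line sketch of the assumed helper,
# bounding a value to the closed interval [lo, hi]:
def clamp(value, lo, hi):
    return max(lo, min(hi, value))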
def main():
    args = parser.parse_args()
    json_data = csv_parser.parse_csv(
        csv_file=args.csv_file,
        year=args.year,
        schema_file=args.schema_file,
        schema_name=args.schema_name,
        schema_version=args.schema_version,
        csv_schema_mapping=args.csv_schema_mapping,
        company_registry=args.company_registry)
    py_data = json.loads(json_data)
    result = issue_credentials(environment=args.environment,
                               url=args.url,
                               issuer_key=args.issuer_key,
                               data=py_data)
    print(json.dumps(json.loads(result), indent=4))
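# The module-level parser is not defined in this snippet; a sketch of the
# argparse setup that the attribute names above imply. Flag spellings and
# which arguments are required are assumptions.
import argparse

parser = argparse.ArgumentParser(description='Issue credentials from CSV data.')
for flag in ('--csv-file', '--year', '--schema-file', '--schema-name',
             '--schema-version', '--csv-schema-mapping', '--company-registry',
             '--environment', '--url', '--issuer-key'):
    parser.add_argument(flag)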
        yvals.append(yval)
        print("x: {}, y: {}".format(xval, yval))
    return xvals, yvals


# Takes x, y values and plots them; also maps x, y -> angles -> x, y and
# plots the round trip.
def plotIKcircle(temp):
    xvals, yvals = temp
    plt.plot(xvals, yvals, 'rd')  # red diamonds
    thetavals = [get_angles(tempx, tempy)
                 for tempx, tempy in zip(xvals, yvals)]
    for thetaval in thetavals:
        print(thetaval)
    IKvals = [angles2xy(theta1, theta2) for theta1, theta2 in thetavals]
    tempx2, tempy2 = zip(*IKvals)
    plt.axis('equal')
    plt.plot(tempx2, tempy2, 'b.')  # blue dots
    # print("x: {}, y: {}".format(tempx2, tempy2))


def plot_theta_vals(thetavals):
    IKvals = [angles2xy(theta1, theta2) for theta1, theta2 in thetavals]
    tempx2, tempy2 = zip(*IKvals)
    plt.axis('equal')
    plt.plot(tempx2, tempy2, 'ro')


thetavals = parse.parse_csv()
plot_theta_vals(thetavals)
plotworkingenvelope()
# plotIKcircle(circle(-100, 0))
plt.show()
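# get_angles and angles2xy are defined elsewhere in this project; a sketch of
# the assumed two-link arm forward kinematics. The link lengths LINK1 and
# LINK2 are placeholders.
import math

LINK1, LINK2 = 100.0, 100.0  # assumed link lengths


def angles2xy(theta1, theta2):
    """Forward kinematics: joint angles (radians) -> end-effector (x, y)."""
    x = LINK1 * math.cos(theta1) + LINK2 * math.cos(theta1 + theta2)
    y = LINK1 * math.sin(theta1) + LINK2 * math.sin(theta1 + theta2)
    return x, y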
# Imported modules
import parse as p

f = input("Please input a file. > ")
cell = int(input(
    "Which column contains the data you would like counted? e.g. [1, 2, 3...] > "
)) - 1
parsed = p.parse_csv(f)
data = p.pull(parsed, cell)
print(p.items(data))
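# pull and items come from parse.py and are not shown; a sketch of the assumed
# behaviour: pull extracts one column from the parsed rows, items tallies the
# values in it.
from collections import Counter


def pull(rows, column):
    return [row[column] for row in rows]


def items(values):
    return Counter(values)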
configs = {
    fn.strip("/."): read_config(fn)
    for fn in folder_names if read_config(fn)
}

# Get the build context
API_KEY = os.environ.get('GM_API_KEY')
API_KEY_DEV = os.environ.get('GM_API_KEY_DEV')
context = {'GM_API_KEY': API_KEY, "map_configs": configs}
print("BUILDING WITH CONTEXT")
print(json.dumps(context, indent=4))

# Build the home index.html
env.get_template('home.html').stream(context).dump("index.html")

# Build the admin index.html
env.get_template('admin.html').stream(context).dump("draw/index.html")

# Build each of the map folders (in order of folder name, alphabetically)
for dir_name, config in configs.items():
    # Make the data.json file
    if len(sys.argv) <= 1:
        parse_csv(f"{dir_name}/data.csv", config, API_KEY_DEV)
    # Get the template context
    context.update({"config": config})
    # Render the template to an index.html
    env.get_template('map.html').stream(context).dump(f"{dir_name}/index.html")
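# read_config is not shown in this snippet; a sketch assuming a per-folder
# JSON config that may be absent (the comprehension above keeps only folders
# where it returns something truthy). The config.json filename is an
# assumption.
import json
import os


def read_config(folder):
    path = os.path.join(folder, 'config.json')
    if not os.path.isfile(path):
        return None
    with open(path) as fh:
        return json.load(fh)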
import math

from parse import parse_csv
from predict import (AllPredictor, EngineBothPredictor, EngineCCPredictor,
                     EngineCO2Predictor, InsuranceGroupPredictor,
                     RegDateInsPredictor, RegDatePredictor, TypePredictor)

users = parse_csv("anonwhipdata.csv")
predictors = [p(users) for p in [
    RegDatePredictor, InsuranceGroupPredictor, RegDateInsPredictor,
    AllPredictor, TypePredictor, EngineCCPredictor, EngineCO2Predictor,
    EngineBothPredictor]]
sqdiffs = [0] * len(predictors)
matches = [0] * len(predictors)
sqcounts = [0] * len(predictors)
for user in users:
    if user.monthly_rate:
        for i, predictor in enumerate(predictors):
            result = predictor.predict(user)
            if result:
                difference = user.monthly_rate - result.price
                sqdiffs[i] += difference ** 2
                sqcounts[i] += 1
                matches[i] += result.matches
for i, predictor in enumerate(predictors):
    # Report RMSE of the predicted price and the average match count.
    print("%- 25s %0.2f % 5s" % (predictor.__class__.__name__,
                                 math.sqrt(sqdiffs[i] / sqcounts[i]),
                                 matches[i] / sqcounts[i]))
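# The predictors are imported from predict.py, which is not shown. From the
# usage above, each one takes the user list at construction, and predict(user)
# returns None or an object with .price and .matches; a sketch of that
# assumed interface:
from collections import namedtuple

Prediction = namedtuple('Prediction', ['price', 'matches'])


class BasePredictor:
    def __init__(self, users):
        self.users = users

    def predict(self, user):
        """Return a Prediction, or None when no comparable users exist."""
        raise NotImplementedError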
from pptx import Presentation
from pptx.util import Inches

from parse import parse_csv, bonus_csv
from slides import fill_slide, bonus_card

presentation = Presentation()
presentation.slide_width = Inches(2.5)
presentation.slide_height = Inches(3.5)

cards = parse_csv('cards.csv')
bonus = bonus_csv('bonus.csv')


def add_slide_from_card(c):
    layout = presentation.slide_layouts[6]  # blank layout
    slide = presentation.slides.add_slide(layout)
    fill_slide(slide, c)


def add_bonus_card(c):
    layout = presentation.slide_layouts[6]  # blank layout
    slide = presentation.slides.add_slide(layout)
    bonus_card(slide, c)


for card in cards:
    add_slide_from_card(card)
for card in bonus:
    add_bonus_card(card)

presentation.save('cards.pptx')
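# fill_slide and bonus_card live in slides.py and are not shown; a minimal
# sketch of a slide filler using the python-pptx text-box API. The card's
# .title attribute is an assumption.
def fill_slide(slide, card):
    box = slide.shapes.add_textbox(Inches(0.2), Inches(0.2),
                                   Inches(2.1), Inches(0.8))
    box.text_frame.text = card.title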
import matplotlib.pyplot as plt
import numpy as np
import xgboost
from sklearn import svm, tree
from sklearn.ensemble import (AdaBoostClassifier, BaggingClassifier,
                              ExtraTreesClassifier, GradientBoostingClassifier,
                              RandomForestClassifier)
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import (accuracy_score, average_precision_score,
                             confusion_matrix, precision_recall_curve,
                             precision_recall_fscore_support)
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier

from parse import parse_csv
from validation import validate


def train_and_test(data_dir, baseline='neg'):
    pos_samples = parse_csv('{}/pos.csv'.format(data_dir))
    neg_samples = parse_csv('{}/neg.csv'.format(data_dir))
    pos_samples_l = [pos_sample + [1] for pos_sample in pos_samples]
    neg_samples_l = [neg_sample + [0] for neg_sample in neg_samples]
    all_samples_labelled = pos_samples_l + neg_samples_l
    X = np.array([sample[:-1] for sample in all_samples_labelled])
    y = np.array([sample[-1] for sample in all_samples_labelled])

    classifiers = []
    res = {}  # Dictionary for results

    # List of classifiers used; see the parameters used in validation.py.
    # Seed (random_state) fixed to 23 (chosen arbitrarily) for reproducibility.
    classifiers.append((None, 'Baseline'))
    model1 = xgboost.XGBClassifier(random_state=23)
    classifiers.append((model1, 'XGB'))
    model2 = svm.SVC(probability=True, random_state=23)
    classifiers.append((model2, 'SVC'))
    model3 = tree.DecisionTreeClassifier(random_state=23)
    classifiers.append((model3, 'Decision Tree'))
    model4 = RandomForestClassifier(random_state=23)
    classifiers.append((model4, 'Random Forest'))
    model5 = GaussianNB()
    classifiers.append((model5, 'Gaussian Naive Bayes'))
    model6 = SGDClassifier(random_state=23)
    classifiers.append((model6, 'SGD'))
    model7 = KNeighborsClassifier()
    classifiers.append((model7, 'K-neighbors'))
    model8 = BaggingClassifier(tree.DecisionTreeClassifier(random_state=23),
                               random_state=23)
    classifiers.append((model8, 'Bagged Decision Trees 1'))
    model9 = BaggingClassifier(tree.DecisionTreeClassifier(random_state=23),
                               random_state=23)
    classifiers.append((model9, 'Bagged Decision Trees 2'))
    model10 = ExtraTreesClassifier(random_state=23)
    classifiers.append((model10, 'Extra Trees'))
    model11 = AdaBoostClassifier(random_state=23)
    classifiers.append((model11, 'AdaBoost 1'))
    model12 = AdaBoostClassifier(random_state=23)
    classifiers.append((model12, 'AdaBoost 2'))
    model13 = GradientBoostingClassifier(random_state=23)
    classifiers.append((model13, 'Gradient Boosting'))

    # Random train and test splits using 80% as training set and 20% as test set.
    # Seed (random_state) fixed to 23 (chosen arbitrarily) for reproducibility.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, shuffle=True, random_state=23)
    opt_classifiers = validate(X_train, y_train, classifiers)

    for clf_tup in opt_classifiers:
        if clf_tup[1] == 'Baseline':
            # Baseline prediction: all positive or all negative.
            # Always negative in our case since there are fewer SBRs than NSBRs.
            y_pred = [0 if baseline == 'neg' else 1 for _ in range(len(X_test))]
        else:
            clf = clf_tup[0]  # Already-tuned classifier (parameters in validation.py)
            clf.fit(X_train, y_train)  # TRAINING
            y_pred = clf.predict(X_test)  # TESTING
            # Probabilities for the precision/recall curve.
            y_score = np.array([p[1] for p in clf.predict_proba(X_test)])

        cm = confusion_matrix(y_test, y_pred)
        clf_name = clf_tup[1]  # Classifier name, identifying which classifier is used
        res[clf_name] = {}
        res[clf_name]['TN'] = cm[0][0]
        res[clf_name]['FP'] = cm[0][1]
        res[clf_name]['FN'] = cm[1][0]
        res[clf_name]['TP'] = cm[1][1]
        res[clf_name]['acc'] = accuracy_score(y_test, y_pred)

        if clf_tup[1] == 'Baseline':
            if baseline == 'neg':
                res[clf_name]['prec'] = float('NaN')  # no true positives, no false positives
                res[clf_name]['rec'] = 0.0  # no true positives
                res[clf_name]['fsco'] = float('NaN')
            else:
                res[clf_name]['prec'] = cm[1][1] / (cm[1][1] + cm[0][1])  # TP / (TP + FP)
                res[clf_name]['rec'] = 1.0  # no false negatives
                res[clf_name]['fsco'] = 2 / (1 + 1 / res[clf_name]['prec'])  # Harmonic mean
        else:
            prec, rec, fsco, _ = precision_recall_fscore_support(
                y_test, y_pred, pos_label=1, average='binary')
            res[clf_name]['prec'] = prec  # precision computed by scikit-learn
            res[clf_name]['rec'] = rec  # recall computed by scikit-learn
            res[clf_name]['fsco'] = fsco  # F-score computed by scikit-learn
            # Average precision and precision/recall values, via scikit-learn.
            average_precision = average_precision_score(y_test, y_score)
            precision, recall, _ = precision_recall_curve(y_test, y_score)

            # Plot the precision/recall curve.
            step_kwargs = {'step': 'post'}
            plt.step(recall, precision, color='b', alpha=0.2, where='post')
            plt.fill_between(recall, precision, alpha=0.2, color='b', **step_kwargs)
            plt.xlabel('Recall')
            plt.ylabel('Precision')
            plt.ylim([0.0, 1.05])
            plt.xlim([0.0, 1.0])
            plt.title('{}: 2-class Precision-Recall curve: AP={:0.2f}'.format(
                clf_name, average_precision))
            # plt.show()  # Uncomment to show plots

    # Display results.
    print('\n{:32s} {:4s} {:4s} {:4s} {:4s} {:10s}{:10s} {:10s}{:10s}'.format(
        '', 'TP', 'FP', 'FN', 'TN', 'Accuracy', 'Precision', 'Recall', 'F-score'))
    for clf in res.keys():
        curr = res[clf]
        print('{:30s} {:4d} {:4d} {:4d} {:4d} {:10f} {:10f} {:10f} {:10f}'.format(
            clf, curr['TP'], curr['FP'], curr['FN'], curr['TN'],
            curr['acc'], curr['prec'], curr['rec'], curr['fsco']))
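# validate comes from validation.py, which is not shown. A sketch of the
# assumed contract: tune each classifier on the training split and return the
# same (model, name) tuples with tuned models. The empty parameter grid is a
# placeholder for the real grids in validation.py.
from sklearn.model_selection import GridSearchCV


def validate(X_train, y_train, classifiers, cv=5):
    tuned = []
    for model, name in classifiers:
        if model is None:  # the baseline has nothing to tune
            tuned.append((model, name))
            continue
        search = GridSearchCV(model, param_grid={}, cv=cv)
        search.fit(X_train, y_train)
        tuned.append((search.best_estimator_, name))
    return tuned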
    subcategories = [E.category(
        E.id(subcategory.id),
        E.title(subcategory.title)
    ) for subcategory in programme['subcategories']
        if subcategory.parent is category]
    if subcategories:
        category_node.append(E.categories(*subcategories))
    node.append(category_node)
    return node


programme_dicts, categories, subcategories = parse_csv()
for programme in programme_dicts:
    with open('xml/{}.xml'.format(programme['pid']), 'w') as xml_file:
        xml = ET.tostring(
            E.programme(
                E.pid(programme['pid']),
                E.complete_title(
                    unicode(programme['complete_title'], 'utf-8')),
                E.media_type(programme['media_type']),
                E.masterbrand(programme['masterbrand']),
                E.brand_pid(programme['brand_pid']),
                E.is_clip(programme['is_clip']),
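# The imports this snippet relies on are not shown; it appears to use the lxml
# element builder, roughly as below. Note the snippet itself is Python 2 (it
# uses the unicode() built-in).
from lxml import etree as ET
from lxml.builder import E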
    def test_parse_csv(self):
        for filename in self.FILES:
            with open(filename) as csv_file:
                parse.parse_csv(csv_file)
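# The enclosing test case and its FILES list are not shown in this snippet; a
# sketch of the assumed unittest harness (the file paths are placeholders):
import unittest

import parse


class ParseCsvTest(unittest.TestCase):
    FILES = ['fixtures/example.csv']  # placeholder; the real list is not shown

    # test_parse_csv (above) would live here


if __name__ == '__main__':
    unittest.main()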