def build_data(self, classes, data_path):
    self.classes = sorted(list(map(lambda l: l.lower(), classes)))
    data = pd.read_csv(data_path).replace({np.nan: None})
    items = []
    for i, row in data.iterrows():
        label = row['Type'].lower()
        if label not in self.classes:
            continue
        sender = self.__get_sender_str(row['Sender'])
        subject = util.clean(row['Subject'])
        try:
            text = util.clean(row['Text'])
        except:
            print(row)
            raise Exception
        unsubscribe = self.__get_unsubscribe_str(row['Unsubscribe'])
        extensions = self.__get_extensions_str(row['Files'])
        items.append({
            'sender': sender,
            'subject': subject,
            'text': text,
            'unsubscribe': unsubscribe,
            'extensions': extensions,
            'type': label
        })
    self.train_data = pd.DataFrame(items)
    self.train_data.to_csv(os.path.join(os.path.dirname(data_path), 'train.csv'), index=False)
def take_control(self):
    """
    1) Detect candidate Entities from current location.
    2) Examine entities to get detailed descriptions
    3) Extract nested entities from detailed descriptions
    """
    obs = yield
    curr_loc = kg.player_location
    undescribed_entities = self.get_descriptionless_entities()
    if undescribed_entities:
        entity = undescribed_entities[0]
        action = gv.Examine(entity.name)
        response = yield action
        entity.description = response
        p_valid = self._valid_detector.action_valid(action, response)
        dbg("[EXM] p={:.2f} {} --> {}".format(p_valid, action, clean(response)))
        curr_loc.add_action_record(action, 1., response)
    else:
        entity_name = self._to_examine[curr_loc].pop()
        action = gv.Examine(entity_name)
        response = yield action
        p_valid = self._valid_detector.action_valid(
            action, first_sentence(response))
        success = (p_valid > self._validation_threshold)
        self.record(success)
        dbg("[EXM]({}) p={:.2f} {} --> {}".format(
            "val" if success else "inv", p_valid, action, clean(response)))
        curr_loc.add_action_record(action, p_valid, response)
        if success:
            entity = curr_loc.get_entity_by_description(response)
            inv_entity = kg.inventory.get_entity_by_description(response)
            if entity is None and inv_entity is None:
                entity = Entity(entity_name, curr_loc, description=response)
                # TODO: incorrect for entities discovered inside other entities
                curr_loc.add_entity(entity)
            else:
                if entity:
                    dbg("[EXM](val) Discovered alternate name "
                        "'{}' for '{}'".format(entity_name, entity.name))
                    entity.add_name(entity_name)
                if inv_entity:
                    dbg("[EXM](val) Discovered alternate name "
                        "'{}' for inventory item '{}'".format(entity_name, inv_entity.name))
                    inv_entity.add_name(entity_name)
def build_text_matrix_no_cols(schema):
    # answers schema
    rows = {row['id']: clean(row['text']) for row in schema['rows']}
    choices = {
        choice['id']: clean(choice['text'])
        for choice in schema['choices']
    }
    return rows, choices
def main(): """ Looks for files in this format: <artist> - <title>.mp3 Writes <artist> and <title> as ID3 tag in the file. """ parser = argparse.ArgumentParser(description='Rename and tag audio files.') parser.add_argument("-o", "--omit-youtube-id", action="store_true", help="The files don't have a youtube-ID in their name.") parser.add_argument("-l", "--lazy", action='store_true', help="Don't prompt for every file, assume 'yes' for all.") parser.add_argument("-p", "--pretend", action='store_true', help="Don't do anything, just print what would be done (Assumes '-l').") parser.add_argument("-f", "--filetype", dest="filetype", help="FILETYPE to rename and tag (default: 'mp3').", metavar="FILETYPE", default="mp3") parser.add_argument("-s", "--skip-renaming", action="store_true", help="Skip renaming, just tag.") parser.add_argument("-a", "--album-tag", dest="albumtag", help="Write ALBUMTAG in the id3 album track tag of all files.", metavar="ALBUMTAG", default="") parser.add_argument("directory", help="Directory that holds the audio files.", type=str) args = parser.parse_args() output_folder = "done" full_output_folder = "%s%s%s" % (args.directory, os.sep, output_folder) util.create_output_folder(full_output_folder) if not args.skip_renaming: renamr = Renamr(**vars(args)) renamr.output_folder = full_output_folder renamr.directory = args.directory renamr.rename() for i in os.listdir(args.directory): if i.endswith(".%s" % args.filetype): subset = 1 + len(args.filetype) filename = i[:-subset] splitted = filename.split("-") if len(splitted) < 2: print "[ERROR] Could not process %s" % i continue artist = string.capwords(splitted.pop(0)) title = string.capwords(" - ".join(splitted)) artist = re.sub(' +', ' ', artist) title = re.sub(' +', ' ', title) full_filename = "%s%s%s" % (args.directory, os.sep, i) id3info = eyed3.load(full_filename) if id3info.tag == None: id3info.initTag() id3info.tag.title = unicode(title) id3info.tag.artist = unicode(artist) if len(args.albumtag) > 0: id3info.tag.album = unicode(string.capwords(args.albumtag)) id3info.tag.save() util.clean(full_output_folder)
def getTrackFromTracklist(self, url, artist, track):
    r = requests.get(self.wwwToApi(url))
    cloudcast = json.loads(u.toUtf8(r.text))
    if ('sections' in cloudcast):
        for section in cloudcast['sections']:
            if ('track' in section):
                tracklistArtistName = section['track']['artist']['name']
                if (u.clean(tracklistArtistName) == u.clean(artist)):
                    trackName = self.getFullName(section['track']['artist']['name'],
                                                 section['track']['name'])
                    return trackName
    return None
async def match_label(labels_url, control_name, headers, page=1):
    "recursively search for a label that matches the control_name parameter"
    page_number = re.search(r"page=(\d+)?", labels_url).group(1)
    url = labels_url.replace(page_number, str(page))
    labels = await get_api_data(url, headers)
    for label in labels:
        if clean(label["name"]) == clean(control_name):
            return label
        # if we are at the last label on the page and no match is found,
        # request the next page of labels
        if labels.index(label) == len(labels) - 1:
            match = await match_label(url, control_name, headers, page + 1)
            return match
    return None
def main(argv):
    skip = []
    workdir = os.getcwd()
    proj = None
    bug = None
    onlyFailing = False
    onlyRelevant = False
    onlyTest = None
    mem = None
    try:
        opts, args = getopt.getopt(argv, "hp:b:w:s:t:m:fr")
    except getopt.GetoptError as e:
        print "Error in your arguments: ", e
        show_help()
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            show_help()
            sys.exit()
        elif opt == '-p':
            proj = arg
        elif opt == '-b':
            bug = arg
        elif opt == '-t':
            onlyTest = arg
        elif opt == '-f':
            onlyFailing = True
        elif opt == '-r':
            onlyRelevant = True
        elif opt == '-w':
            workdir = arg
        elif opt == '-s':
            skip = arg.split(',')
        elif opt == '-m':
            mem = arg
    if proj == None or bug == None:
        print "Please specify a project and bug number\n"
        show_help()
        sys.exit(2)
    if not 'run' in skip:
        util.clean(workdir)
        if not defects4j.from_cache(workdir, proj, bug):
            defects4j.checkout(workdir, proj, bug)
            defects4j.run(workdir, proj, bug, onlyFailing, onlyRelevant, onlyTest)
            defects4j.cache(workdir, proj, bug)
    if not 'falo' in skip:
        falo.run(workdir, proj, bug, mem)
    if not 'after' in skip:
        falo.keep_interesting_graphs(workdir, proj, bug)
def run_ica_averaged(df, labels):
    # Now do ICA on the mean of the signals. Recreate the data
    df = df_post_outlier.copy()
    df = util.filter_countries_lt_n_samples(df, 5, sample_lookup)
    df = util.clean(df)
    print("ICA shape", df.shape)
    df_countries = df.join(labels)
    df_country_means = df_countries.groupby('label').mean()
    df_country_means = util.clean(df_country_means)
    M = util.build_matrix(df_country_means)
    ICA_projection = util.do_ica(M, M.shape[0])
    np.savetxt("{}/results/ICA_projection.dat".format(dir_path), ICA_projection)
    pd.DataFrame(df_country_means.index).to_csv(
        '{}/results/ICA_countries.csv'.format(dir_path))
def execute(args):
    np.random.seed(42)
    if len(args) < 1:
        usage()
        sys.exit()
    names, y, x = parse(args[0])
    indices = [int(i) for i in args[1:]]
    relevant_names = names[1:]
    x = clean(relevant_names, x)
    if len(indices) > 0:
        x = [[sample[i] for i in indices] for sample in x]
        relevant_names = [relevant_names[i] for i in indices]
    print "Clustering on", str(relevant_names) + "..."
    labels = np.unique(y)
    kmeans = KMeans(n_clusters=CLUSTER_FACTOR * len(labels), random_state=0)
    y_pred = kmeans.fit_predict(x)
    counts = get_cluster_counts(y, y_pred)
    totals = [0] * len(counts)
    print counts
    for i, mapping in counts.iteritems():
        totals[i] = sum(mapping.values())
    finals = get_final_mapping(counts, totals)
    if len(finals) < len(labels):
        print "WARNING: Not all clusters unique!"
    print "FINAL CLUSTERS", finals
    print
    print "NUM LABELS", len(labels)
    print "ACCURACY", accuracy(finals, labels)
    return accuracy(finals, labels)
def submit_blast():
    bundles = get_bundles("old")
    app.logger.info(pformat(request.form))
    form = BlastForm(request.form)
    email_is_valid = validate_email(request.form["stripeEmail"])
    amount = request.form["amount"]

    if email_is_valid:
        customer = stripe.Customer.create(email=request.form["stripeEmail"],
                                          card=request.form["stripeToken"])
        app.logger.info(f"Customer id: {customer.id}")
    else:
        message = "There was an issue saving your email address."
        return render_template("error.html", message=message, bundles=bundles)

    if form.validate():
        app.logger.info("----Adding Blast subscription...")
        add_blast_subscription.delay(customer=customer, form=clean(request.form))
        if amount == "349":
            event_label = "annual"
        elif amount == "40":
            event_label = "monthly"
        elif amount == "325":
            event_label = "annual tax exempt"
        gtm = {"event_value": amount, "event_label": event_label}
        return render_template("blast-charge.html", bundles=bundles, gtm=gtm)
    else:
        app.logger.error("Failed to validate form")
        message = "There was an issue saving your donation information."
        return render_template("error.html", message=message, bundles=bundles)
def do_charge_or_show_errors(template, bundles, function):
    app.logger.debug("----Creating Stripe customer...")
    email = request.form["stripeEmail"]
    installment_period = request.form["installment_period"]
    amount = request.form["amount"]
    try:
        customer = stripe.Customer.create(email=email, card=request.form["stripeToken"])
    except stripe.error.CardError as e:
        body = e.json_body
        err = body.get("error", {})
        message = err.get("message", "")
        form_data = request.form.to_dict()
        del form_data["stripeToken"]
        return render_template(
            template,
            bundles=bundles,
            key=app.config["STRIPE_KEYS"]["publishable_key"],
            message=message,
            form_data=form_data,
        )
    app.logger.info(f"Customer id: {customer.id}")
    function(customer=customer, form=clean(request.form))
    gtm = {
        "event_value": amount,
        "event_label": "once" if installment_period == "None" else installment_period,
    }
    return render_template("charge.html", gtm=gtm, bundles=get_bundles("charge"))
def test__clean():
    form = {
        "a": "None",
        "b": "True",
        "c": "False",
        "d": "None",
        "e": "none",
        "f": None,
        "g": True,
        "h": False,
        "i": 9,
        "j": 8.1,
        "k": "3.2",
        "l": "4",
        "m": "string",
    }
    expected = {
        "a": None,
        "b": True,
        "c": False,
        "d": None,
        "e": "none",
        "f": None,
        "g": True,
        "h": False,
        "i": 9,
        "j": 8.1,
        "k": 3.2,
        "l": 4,
        "m": "string",
    }
    actual = clean(form)
    assert expected == actual
    assert actual["bogus"] is None
def get_init_population(self):
    # load/create db
    fname = os.path.join(self.init_pop_dir, 'init_data.pickle')
    if os.path.isfile(fname):
        with open(fname, 'rb') as f:
            self.db = pickle.load(f)
    else:
        self.db = self.eval_core.generate_data_set(self.n_init_samples, evaluate=True)
        with open(fname, 'wb') as f:
            pickle.dump(self.db, f)

    if len(self.db) >= self.n_init_samples:
        random.shuffle(self.db)
        self.db = self.db[:self.n_init_samples]
    else:
        raise Warning(
            'Number of init_samples is larger than the length of the '
            'initial data base, using the len(db) instead of n_init_samples'
        )

    self.db = clean(self.db, self.eval_core)
    self.db = relable(self.db, self.eval_core)
    self.db = sorted(self.db, key=lambda x: x.cost)
    # HACK for paper
    # self.db = self.db[1:]
    self.logger.log_text("[INFO] Best cost in init_pop = {}".format(self.db[0].cost))
def main():
    parser = OptionParser(usage="usage: %prog [--input tweets.csv] [--output tweets_clean.txt] [--column tweet]",
                          version="%prog 1.0")
    parser.add_option("-i", "--input", dest="input", default="tweets.csv",
                      help="the input CSV file",)
    parser.add_option("-c", "--column", dest="column", default="tweet",
                      help="the CSV column to be cleaned and saved",)
    parser.add_option("-o", "--output", dest="output", default="tweets_clean.txt",
                      help="the output text file to be saved",)
    (options, args) = parser.parse_args()

    with open(options.output, "w") as my_output_file:
        with open(options.input, "r") as my_input_file:
            # one line method
            # [my_output_file.write("".join(util.clean(row[options.column])) + '\n')
            #  for row in csv.DictReader(my_input_file)]

            # with progress bar
            size = sum(1 for row in csv.DictReader(my_input_file))
            my_input_file.seek(0)
            reader = csv.DictReader(my_input_file)
            bar = Bar('Processing', max=size)
            for row in reader:
                my_output_file.write("".join(util.clean(row[options.column])) + '\n')
                bar.next()
            bar.finish()
    my_output_file.close()
def add_blast_subscription(form=None, customer=None):
    """
    Adds a Blast subscription. Blast subscriptions are always recurring. They have
    two email addresses: one for billing and one for the newsletter subscription.
    """

    form = clean(form)

    first_name = form["first_name"]
    last_name = form["last_name"]
    email = form["subscriber_email"]

    logging.info("----Getting contact...")
    contact = Contact.get_or_create(email=email, first_name=first_name, last_name=last_name)
    logging.info(contact)

    rdo = RDO(contact=contact)
    rdo.stripe_customer = customer["id"]
    rdo.campaign_id = form["campaign_id"]
    rdo.referral_id = form["referral_id"]
    rdo.lead_source = "Stripe"
    rdo.amount = form.get("amount", 0)
    rdo.agreed_to_pay_fees = form["pay_fees_value"]

    # Blast specific:
    rdo.installments = 0
    rdo.description = "Blast Subscription"
    rdo.open_ended_status = "Open"
    if int(float(rdo.amount)) == 40:
        rdo.installment_period = "monthly"
    else:
        rdo.installment_period = "yearly"
    now = datetime.now(tz=ZONE).strftime("%Y-%m-%d %I:%M:%S %p %Z")
    rdo.name = f"{first_name} {last_name} - {now} - The Blast"
    rdo.type = "The Blast"
    rdo.billing_email = form["stripeEmail"]
    rdo.blast_subscription_email = form["subscriber_email"]

    logging.info("----Saving RDO....")
    apply_card_details(rdo=rdo, customer=customer)
    rdo.save()
    logging.info(rdo)

    # get opportunities
    opportunities = rdo.opportunities()
    today = datetime.now(tz=ZONE).strftime("%Y-%m-%d")
    opp = [
        opportunity
        for opportunity in opportunities
        if opportunity.expected_giving_date == today
    ][0]
    try:
        charge(opp)
    except ChargeException:
        # TODO should we alert slack? Did not because we had no notifications here before.
        pass

    return True
def execute(args):
    if len(args) < 1:
        usage()
        sys.exit()

    # Parse data
    # names == feature labels
    # x     == features that correspond to shuffled names
    # y     == shuffled names
    names, y, x = parse(args[0])
    x = util.clean(names, x)

    # Runs RFC on every combination of pairs of users for a binary classification.
    # The number of possible class predictions = 2.
    num_users = len(set(y))
    num_combos = np.math.factorial(num_users) / (2 * np.math.factorial(num_users - 2))
    print "Testing 2-way combinations of users for binary classification:"
    print "Number of users: %d" % (num_users)
    print "Number of combinations: %d" % (num_combos)
    print
    print "================================================================================"
    print "Evaluating Combinations of Users:"
    print

    combos = get_binary_user_combinations(y)
    COMBO_SCORES = {}
    for c in combos:
        x_pruned, y_pruned = prune_data(x, y, c)
        x_train, x_test, y_train, y_test = train_test_split(x_pruned, y_pruned,
                                                            test_size=TRAIN_PARTITION,
                                                            random_state=0)

        # Set up Random Forest Classifier
        model = RandomForestClassifier(
            n_estimators=FOREST_SIZE,
            criterion=CRITERION,
            max_features=MAX_FEATURES,
            verbose=VERBOSE,
        )
        model.fit(x_train, y_train)

        # Updating combination scores
        if c[0] not in COMBO_SCORES:
            COMBO_SCORES[c[0]] = {}
        if c[1] not in COMBO_SCORES:
            COMBO_SCORES[c[1]] = {}
        score = average_score_k_trials(model, x_test, y_test, 5)
        COMBO_SCORES[c[0]][c[1]] = score
        COMBO_SCORES[c[1]][c[0]] = score
        print "\tEvaluating users: %35s %f" % (c, score)

    # Evaluate final results
    evaluate(COMBO_SCORES, 2)
def analyze(self, n, source):
    queue = []
    for line in self.file:
        self.numlines += 1
        line = util.clean(line)
        # copy the carried-over queue so transferring leftovers back into it is safe
        words = list(queue)
        # Assumes each line in the corpus is a separate 'sentence'
        for _ in range(n):
            words.append('-BEGIN-')
        words.extend(line.split())  # current words to be considered
        if line.split():
            self.begin_map[line.split()[0]] += 1
        if ((source == "rap") and
                ((line == "") or
                 (string.find(line, "verse") != -1) or
                 (string.find(line, "hook") != -1) or
                 (string.find(line, "bridge") != -1))):
            words = []
            queue = []  # reset queue upon reading new line
        while (len(words) > n):
            key = []
            for i in range(n):
                key.append(words[i])
            k = tuple(key)
            self.frequency_map[k] += 1
            if k not in self.word_map:
                self.word_map[k] = Counter({words[i + 1]: 1})
            else:
                self.word_map[k].update({words[i + 1]: 1})
            words.pop(0)
        queue = words  # carry leftover words over to the next line
def execute(args):
    if len(args) < 1:
        usage()
        sys.exit()

    # Parse data
    # names == feature labels
    # x     == features that correspond to shuffled names
    # y     == shuffled names
    names, y, x = parse(args[0])
    x = util.clean(names, x)

    # Runs a multi-class classification using Random Forest.
    # The number of possible class predictions = number of users.
    print "Running full multi-class classification:"
    print "Number of users: %d" % (len(set(y)))
    print

    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=TRAIN_PARTITION, random_state=0)

    # Set up Random Forest Classifier
    model = RandomForestClassifier(
        n_estimators=FOREST_SIZE,
        criterion=CRITERION,
        max_features=MAX_FEATURES,
        verbose=VERBOSE,
    )
    model.fit(x_train, y_train)

    # Evaluation
    evaluate(model, x_test, y_test)
def __init__(self, obs, action, score, new_obs, terminal):
    message = '"{}" --> {} Score={}'.format(action, util.clean(new_obs), score)
    super().__init__(message)
    self.obs = obs
    self.action = action
    self.score = score
    self.new_obs = new_obs
    self.terminal = terminal
def predict(self, unsubscribe, sender, subject, text, files):
    item = {
        'unsubscribe': self.__get_unsubscribe_str(unsubscribe),
        'sender': self.__get_sender_str(sender),
        'subject': util.clean(subject) if subject is not None else '',
        'text': util.clean(text) if text is not None else '',
        'extensions': self.__get_extensions_str(files)
    }
    x = ''
    for col in self.features_ordered:
        if item[col] is not None and len(item[col]) > 0:
            x += str(item[col]).lower() + ' '
    x = x.strip()
    predictions = self.model.predict(np.array([x]))[0]
    return self.classes[np.argmax(predictions)], predictions, x
def run_ica_not_averaged(df, labels):
    # Do ICA not averaged
    df = df_post_outlier.copy()
    df = util.clean(df)
    M = util.build_matrix(df)
    ICA_projection = util.do_ica(M, 30)
    np.savetxt("{}/results/all_samples_ICA_projection.dat".format(dir_path),
               ICA_projection)
    countries = labels.loc[df.index]['label']
    pd.DataFrame(countries).to_csv(
        '{}/results/all_samples_ICA_countries.csv'.format(dir_path))
def process_event(self, event):
    """ Process an event from the event stream. """
    location, message = self.get_event_info(event)
    if location not in self._to_examine:
        self._to_examine[location] = []
    if not message:
        return
    candidate_entities = self.detect_entities(message)
    dbg("[EXM](detect) {} --> {}".format(clean(message), candidate_entities))
    self.filter(candidate_entities)
def process_question(schema_question, question):
    question_text = clean(schema_question['headings'][0]['heading'])
    results = None
    txt_replace_fn = qtype_handlers.get(schema_question['family'])
    if txt_replace_fn:
        results = {question_text: txt_replace_fn(schema_question, question)}
        # print(results)
    else:
        print(f"\n\n...........family : {schema_question['family']}\n\n")
    return results
def observe(self, obs, action, score, new_obs, terminal):
    """ Observe will be used for learning from rewards. """
    p_valid = self._valid_detector.action_valid(action, new_obs)
    dbg("[VALID] p={:.3f} {}".format(p_valid, clean(new_obs)))
    if kg.player_location:
        dbg("[EAGERNESS] {}".format(' '.join(
            [str(module.get_eagerness()) for module in self.modules[:5]])))
    event_stream.push(
        NewTransitionEvent(obs, action, score, new_obs, terminal))
    action_recognized(action, new_obs)  # Update the unrecognized words
    if terminal:
        kg.reset()
def getNeighboursFromTracklist(self, url, artist, track):
    r = requests.get(self.wwwToApi(url))
    cloudcast = json.loads(u.toUtf8(r.text))
    result = []
    if ('sections' in cloudcast):
        sections = cloudcast['sections']
        for (i, section) in enumerate(sections):
            if ('track' in section):
                tracklistArtistName = section['track']['artist']['name']
                tracklistTrackName = section['track']['name']
                if (u.clean(tracklistArtistName).find(u.clean(artist)) >= 0 and
                        u.clean(tracklistTrackName).find(u.clean(track)) >= 0):
                    if (i > 0):
                        prev = sections[i - 1]
                        name = u.toName(prev['track']['artist']['name'], prev['track']['name'])
                        result.append(name)
                    if (i < len(sections) - 1):
                        nxt = sections[i + 1]
                        name = u.toName(nxt['track']['artist']['name'], nxt['track']['name'])
                        result.append(name)
                    break
    return result
def process_matrix_with_cols(matrix_schema, data):
    rows, cols, cols_choices = build_text_matrix_with_cols(matrix_schema['answers'])
    results = {}  # {'rows': rows.values(), 'cols': cols.values()}
    for answer in data['answers']:
        row_id = answer.get('row_id')
        if not row_id and answer.get('other_id'):
            results['other'] = answer['text']
            continue
        rtext = rows[row_id]
        ans_col_id = answer['col_id']
        ans_choice_id = answer['choice_id']
        ctext = cols[ans_col_id]
        col_choices = cols_choices[ans_col_id]
        if not ctext:
            # not a list, unlike the ..append and = [] below
            results[rtext] = clean(col_choices[ans_choice_id])
            continue
        ans = clean(col_choices[ans_choice_id])
        # { clean(ctext) : clean(col_choices[ans_choice_id]) }
        if results.get(rtext):
            results[rtext].append(ans)
        else:
            results[rtext] = [ans]
    rr = {}
    for k, v in results.items():
        if len(v) == 1:
            rr[k] = v[0]
        else:
            rr[k] = v
    return rr
def action_valid(self, action, response_text):
    if not util.action_recognized(action, response_text):
        return 0.
    label, proba = self.model.predict(util.clean(response_text))
    p_valid = 0
    if label[0] == '__label__invalid':
        p_valid = 1 - proba[0]
    elif label[0] == '__label__valid':
        p_valid = proba[0]
    else:
        assert False, "Unrecognized Label {}".format(label[0])
    # gv.dbg("[LVD]({}) {} p_Valid={:.2f}".format(action, response_text, p_valid))
    return p_valid
def check_mail(r):
    messages = r.get_unread()
    for message in messages:
        action = "None"
        desc = "None"
        message.mark_as_read()
        if util.clean(message.subject) == 'flair request':
            # parse flair request
            request = parse_request(message.body)
        else:
            # send confused reply
            r.send_message(message.author, 'Request not recognized',
                           'A message from ' + message.author + ' was received, but it '
                           'was not a valid request. If you were trying to send a request '
                           'please check your subject formatting and try again.')
def to_string(self, prefix=''):
    s = prefix + "Entity: {}".format(self.name)
    if self._action_records:
        for action, (p_valid, resp) in self._action_records.items():
            if p_valid > .5:
                s += "\n {}Action record: {} {} - {} (p={:.2f})".format(
                    prefix, action, self.name, util.clean(resp)[:80], p_valid)
    for entity in self._entities:
        s += "\n" + prefix + entity.to_string(prefix + " ")
    if self._attributes:
        s += "\n " + prefix + "Attributes: "
        for attribute in self._attributes:
            s += attribute.to_string() + " "
    return s
def get_text_qna_mcq(schema_question, data):
    sch_anss = schema_question['answers']
    chosens = []
    ans_ch_ids = [answer.get('choice_id') for answer in data['answers']]
    if ans_ch_ids:
        chosens = list(get_text_by_idlist(ans_ch_ids, sch_anss['choices']))
    if sch_anss.get('other'):
        anstext = [
            clean(answer['text'])
            for answer in data['answers']
            if 'other_id' in answer
        ]
        if anstext:
            chosens.append(anstext[0])
    return chosens
def tokenise(self):
    train_data = []
    train_label = []
    for i in range(self.Td.shape[0]):
        tweet = self.Td['OriginalTweet'].iloc[i]
        sent = self.Td['Sentiment'].iloc[i]
        tweet = util.unicodeToAscii(tweet)
        tweet = util.clean(tweet)
        tweet = tweet.replace('.', ' ')  # str.replace returns a new string
        train_t = [
            word for word in tweet.split(' ')
            if word != ' ' and word != '.'
        ]
        train_d = [w for w in train_t if w != '']
        if (len(train_d) > 0):
            train_a = np.ones(128)
            for j in range(len(train_d)):
                train_a[j] = self.data.FindTestWordToIndex(train_d[j])
            train_data.append(train_a)
            train_label.append(self.Label.WordToIndex[sent])
    return (np.array(train_data, dtype=np.long),
            np.array(train_label, dtype=np.long))
def submit_blast():
    bundles = get_bundles("old")
    app.logger.info(pformat(request.form))
    form = BlastForm(request.form)
    email_is_valid = validate_email(request.form["stripeEmail"])

    if email_is_valid:
        customer = stripe.Customer.create(email=request.form["stripeEmail"],
                                          card=request.form["stripeToken"])
        app.logger.info(f"Customer id: {customer.id}")
    else:
        message = "There was an issue saving your email address."
        return render_template("error.html", message=message, bundles=bundles)

    if form.validate():
        app.logger.info("----Adding Blast subscription...")
        add_blast_subscription.delay(customer=customer, form=clean(request.form))
        return render_template("blast-charge.html", bundles=bundles)
    else:
        app.logger.error("Failed to validate form")
        message = "There was an issue saving your donation information."
        return render_template("error.html", message=message, bundles=bundles)
def execute(args):
    np.random.seed(42)
    if len(args) < 1:
        usage()
        sys.exit()
    names, y, x = parse(args[0])
    indices = [int(i) for i in args[1:]]
    relevant_names = names[1:]
    x = clean(relevant_names, x)
    if len(indices) > 0:
        x = np.asarray([[sample[i] for i in indices] for sample in x])
        relevant_names = [relevant_names[i] for i in indices]
    print "Clustering on", str(relevant_names) + "..."
    labels = np.unique(y)
    af = AffinityPropagation(damping=0.52)
    x_train = random_selection(x, int(len(x) * 0.6))
    af.fit(x_train)
    y_pred = af.predict(x)
    un = np.unique(y_pred)
    counts = get_cluster_counts(y, y_pred)
    totals = {}
    print counts
    for i, mapping in counts.iteritems():
        s = sum(mapping.values())
        if s != 0:
            totals[i] = sum(mapping.values())
    finals = get_final_mapping(counts, totals)
    if len(finals) < len(labels):
        print "WARNING: Not all labels accounted for!"
    print "FINAL CLUSTERS", finals
    print "NUM CLUSTERS", len(counts)
    print "NUM Y_PRED", len([y for y in y_pred if type(y) is not np.ndarray])
    print
    print "ACCURACY", accuracy(finals, labels)
    return accuracy(finals, labels), len(counts)
def unsubscribe(falcon_client, mail_processed):
    should_unsub, unsub_val = is_newsletter(mail_processed)
    if should_unsub:
        subject = clean(mail_processed['Subject'])
        unsub_list = mail_processed['Unsubscribe']
        unsub_list = filter(lambda y: y.startswith('mailto:'),
                            [x.strip()[1:-1] for x in unsub_list.split(', ')])
        unsub_list = list(unsub_list)
        unsub_mail = None
        unsub_subject = 'Unsubscribe'
        if len(unsub_list) > 0:
            unsub_mail = unsub_list[0].replace('mailto:', '')
            unsub_subject_idx = unsub_mail.find('?subject=')
            if unsub_subject_idx > -1:
                unsub_subject = unsub_mail[unsub_subject_idx:].replace('?subject=', '')
                unsub_mail = unsub_mail[:unsub_subject_idx]
        tag = 'Unsubscribing from email list: '
        if unsub_mail is None:
            tag = 'Cannot unsub, moving to trash: '
        print(
            tag, subject, unsub_mail, unsub_subject,
            sep='\n', end='\n------------------\n'
        )
        if unsub_mail is not None:
            try:
                falcon_client.gmail.send_to_unsubscribe(unsub_mail, unsub_subject)
            except Exception as exp:
                print('Failed to unsub.', exp)
def execute(args):
    print 'Starting the artificial neural network'
    if len(args) < 2:
        usage()
        sys.exit()

    ###########################################################################
    # Data
    # names   feature labels
    # y       shuffled names
    # x       features that correspond to shuffled names
    names, y, x = parse(args[1])
    x = clean(names, x)

    usePowerset = args[0]

    # Build features to include in test
    features = args[2:]
    if len(features) == 0:
        features = names
    # print 'Selected features:', features

    # Build all subsets of features, if requested
    if usePowerset.lower() == 'true':
        combos = powerset(features)
    else:
        combos = [features]

    # map from feature set, to map of correct counts for each person
    feature_performance = {}
    highest_correct = 0
    best_combo = {}
    for c in combos:
        if len(c) == 0:
            continue
        print 'Attempting feature set:', c
        x_selected = selectFeatures(copy.copy(names), c, x)

        # Split into testing and training data
        x_train, x_test, y_train, y_test = train_test_split(x_selected, y,
                                                            test_size=0.2,
                                                            random_state=0)

        #######################################################################
        # Models
        logistic = linear_model.LogisticRegression(C=L_REGULARIZATION)
        rbm = BernoulliRBM(random_state=0, verbose=True,
                           learning_rate=N_LEARNING_RATE,
                           n_iter=N_ITER,
                           n_components=N_COMPONENTS)
        # Note: attempted StandardScaler, MinMaxScaler, MaxAbsScaler, without strong results
        # Not needed, since data is scaled to the [0-1] range by clean()
        classifier = Pipeline(steps=[('rbm', rbm), ('logistic', logistic)])

        #######################################################################
        # Training
        print 'Training the classifier...'
        # Training RBM-Logistic Pipeline
        classifier.fit(x_train, y_train)

        correct = 0
        label_counts = defaultdict(int)
        for i in range(len(x_test)):
            test = x_test[i]
            if len(test) == 1:
                test = test.reshape(-1, 1)
            else:
                test = [test]
            predicted = classifier.predict(test)
            if predicted == y_test[i]:
                correct += 1
                label_counts[predicted[0]] += 1
        if correct >= highest_correct:
            highest_correct = correct
            best_combo = c
        feature_performance[str(c)] = {'predictions': label_counts,
                                       'expected': Counter(y_test)}

    ###########################################################################
    # Evaluation
    # evaluate(classifier, x_test, y_test)
    summary = feature_performance[str(best_combo)]
    print 'Accuracy:\t\t\t', highest_correct, 'correct gives', (highest_correct * 1.0 / len(y_test)), 'compared to guessing', (1.0 / len(summary['expected']))
    print 'Best feature set:\t\t', best_combo
    print 'Identified %d out of %d labels' % (len(summary['predictions']), len(summary['expected']))
    for p in summary['predictions']:
        pred = summary['predictions'][p]
        tot = summary['expected'][p]
        print '\t %s \t\t %d\t of %d \t (%f)' % (p, pred, tot, pred * 1.0 / tot)
def execute(args):
    print 'Starting the gridsearch on the artificial neural network'
    if len(args) < 1:
        usage()
        sys.exit()

    ###########################################################################
    # Data
    # names   feature labels
    # y       shuffled names
    # x       features that correspond to shuffled names
    names, y, x = parse(args[0])
    x = clean(names, x)

    # Build features to include in test
    features = args[1:]
    if len(features) == 0:
        features = names
    print 'Selected features:', features
    x = selectFeatures(names, features, x)

    # Split into testing and training data
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

    ###########################################################################
    # Models
    logistic = LogisticRegression()
    rbm = BernoulliRBM(random_state=0, verbose=True)
    classifier = Pipeline(steps=[('rbm', rbm), ('logistic', logistic)])

    ###########################################################################
    # Training
    print 'Training the classifier...'
    # Training RBM-Logistic Pipeline
    # classifier.fit(x_train, y_train)
    # Training Logistic regression
    # logistic_classifier = LogisticRegression(C=100.0)
    # logistic_classifier.fit(x_train, y_train)
    # evaluate(classifier, logistic_classifier, x_test, y_test)

    ###########################################################################
    # Evaluation
    scores = ['precision', 'recall']
    for score in scores:
        print("# Tuning hyper-parameters for %s" % score)
        print()
        clf = GridSearchCV(classifier, param_grid=param_grid, cv=3,
                           scoring='%s_weighted' % score)
        clf.fit(x_train, y_train)
        print("The best parameters are %s with a score of %0.2f"
              % (clf.best_params_, clf.best_score_))
        print()
        print("Grid scores on development set:")
        print()
        for params, mean_score, scores in clf.grid_scores_:
            print("%0.3f (+/-%0.03f) for %r" % (mean_score, scores.std() * 2, params))
        print()
        print("Detailed classification report:")
        print()
        print("The model is trained on the full development set.")
        print("The scores are computed on the full evaluation set.")
        print()
        y_true, y_pred = y_test, clf.predict(x_test)
        print(classification_report(y_true, y_pred))
        print()
    usage()


args = sys.argv[1:]
opts = "ipcsh"
long_opts = "index publish clean sync help".split()
try:
    opts, args = getopt.getopt(sys.argv[1:], opts, long_opts)
except getopt.GetoptError, err:
    # print help information and exit:
    print str(err)  # will print something like "option -a not recognized"
    usage()
    sys.exit(2)
for o, a in opts:
    if o in ("-h", "--help"):
        usage()
        sys.exit()
    elif o in ("-i", "--index"):
        indexify()
    elif o in ("-p", "--publish"):
        publish()
    elif o in ("-c", "--clean"):
        util.clean()
    elif o in ("-s", "--sync"):
        sync()
    else:
        assert False, "unhandled option"


if __name__ == "__main__":
    main()
def compact(number):
    """Convert the number to the minimal representation. This strips the
    number of any valid separators and removes surrounding whitespace."""
    return clean(number, ' -').upper().strip()
password = '******'

irw = iRWebStats()
irw.login(user, password)

if not irw.logged:
    print(
        "Couldn't log in to iRacing Membersite. Please check your credentials")
    exit()

# Cars driven by user
r = irw.cars_driven()  # Returns cars id
print("\n--> 1. Cars driven by custid:%s \n" % (irw.custid))
print("\n".join([irw.CARS[c]['name'] for c in r]))

# Career stats
r = irw.career_stats()
print("\n--> 2. Career stats for custid:%s \n" % (irw.custid))
print(("Starts: %s, Wins: %s, Top 5: %s, Total Laps: %s," +
       " Laps Led: %s") % (r['starts'], r['wins'], r['top5'],
                           r['totalLaps'], r['lapsLed']))

# Driver search
print("\n--> 3. Driver search (Road racers with Average finish from 1 to 3)\n")
drivers, total_drv = irw.driver_search(
    race_type=ct.RACE_TYPE_ROAD, avg_finish=(1, 3), active=True, page=1)
print("Total drivers found: %s. Showing the first %s" % (total_drv, len(drivers)))
print("\n".join(["%s - %s: %s" % (i + 1, clean(x['displayname']), x['irating'])
                 for i, x in enumerate(drivers)]))
def rename(self):
    # length of id + ".mp3"
    # example: -CrTMCLxkONk.mp3
    yt_id_len = 16
    for iter_file in os.listdir(self.directory):
        if iter_file.lower().endswith(".%s" % self.kwargs["filetype"]):
            if self.kwargs["omit_youtube_id"]:
                filename = iter_file
            else:
                filename = "%s" % iter_file[:-yt_id_len]
            filename = re.sub(' +', ' ', filename.strip())
            try:
                filename = unicode(filename, "utf-8")
            except UnicodeDecodeError as e:
                print "Error in file: %s: %s" % (filename, e)
                return
            if self.kwargs["omit_youtube_id"]:
                filename = string.capwords(filename)
                # write filetype lowercase
                filename = filename.replace(self.kwargs["filetype"].title(),
                                            self.kwargs["filetype"].lower())
            else:
                filename = "%s.mp3" % string.capwords(filename)
            filename = " ".join(filename.split())
            name_str = "old> %s \nnew> %s" % (iter_file, filename.encode('utf-8'))
            processing = True
            while processing:
                if not self.kwargs["pretend"]:
                    print name_str
                    print "Is this ok? (y)es / (q)uit / (s)kip / <filename> (with or w/o '.%s')" % (self.kwargs["filetype"])
                if self.kwargs["lazy"] or self.kwargs["pretend"]:
                    var = "y"
                else:
                    var = sys.stdin.readline().strip()
                if var == "":
                    continue
                source = "%s%s%s" % (self.directory, os.sep, iter_file)
                target = "%s%s%s" % (self.output_folder, os.sep, filename.encode('utf-8'))
                if var == "y":
                    if self.kwargs["pretend"]:
                        print prettify(source, target)
                    else:
                        os.rename(source, target)
                    processing = False
                elif var == "q":
                    util.clean(self.output_folder)
                    exit()
                elif var == "s":
                    target = "%s%s%s" % (self.output_folder, os.sep, iter_file)
                    os.rename(source, target)
                    processing = False
                else:
                    # Strip trailing file type
                    var = self.strip_trailing_file_type(var)
                    target = "%s%s%s.%s" % (self.output_folder, os.sep, var,
                                            self.kwargs["filetype"])
                    os.rename(source, target)
                    processing = False
help="number of most frequently used words to compute. " + "defaults to 4.", type=nat, default=4) args = parser.parse_args() txtsrc = opentext (args.pdf , args.gutenberg , args.filename) # traverse the whole file, adding canonical forms of valid words into a # dictionary counting the number of appearances. d = dict() for line in txtsrc: # get rid of ASCII em and en dashes line = (line.replace("---", " ")).replace("--", " ") for word in line.split(): clean_word = clean(word) if clean_word == None: #ignore words that don't parse continue else: # add or update words that do parse incr(clean_word,d) # if we're not reading from a PDF, we have to close the file handle once # we're done counting all the words. the other three settings close # themselves. if not (args.pdf or args.gutenberg): txtsrc.close() # abort if the query makes no sense. note that we can't check this until we # build the dictionary: it depends on the number of unique words.
#!/usr/bin/env python

from util import make_directories, clean
import scripts
import os.path

if __name__ == "__main__":
    make_directories()
    scripts.network_2()
    clean()

    make_directories()
    scripts.network_3()
    clean()

    scripts.crypto_1()
    scripts.crypto_2()