def build_data(self, classes, data_path):
        self.classes = sorted(list(map(lambda l: l.lower(), classes)))

        data = pd.read_csv(data_path).replace({np.nan: None})

        items = []
        for i, row in data.iterrows():
            label = row['Type'].lower()
            if label not in self.classes:
                continue

            sender = self.__get_sender_str(row['Sender'])
            subject = util.clean(row['Subject'])
            try:
                text = util.clean(row['Text'])
            except Exception:
                print(row)
                raise  # re-raise with the original traceback
            unsubscribe = self.__get_unsubscribe_str(row['Unsubscribe'])
            extensions = self.__get_extensions_str(row['Files'])

            items.append({
                'sender': sender,
                'subject': subject,
                'text': text,
                'unsubscribe': unsubscribe,
                'extensions': extensions,
                'type': label
            })

        self.train_data = pd.DataFrame(items)
        self.train_data.to_csv(os.path.join(os.path.dirname(data_path),
                                            'train.csv'),
                               index=False)
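
A minimal sketch of the input this method expects, inferred from the column accesses above (the classifier object and its private helpers are assumed, not shown on this page):

import pandas as pd

toy = pd.DataFrame([{
    'Type': 'Spam', 'Sender': 'a@example.com', 'Subject': 'Hello',
    'Text': 'Buy now', 'Unsubscribe': None, 'Files': None,
}])
toy.to_csv('emails.csv', index=False)
# classifier.build_data(classes=['Spam', 'Ham'], data_path='emails.csv')
# -> writes train.csv next to emails.csv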
Example #2
 def take_control(self):
     """
     1) Detect candidate Entities from current location.
     2) Examine entities to get detailed descriptions
     3) Extract nested entities from detailed descriptions
     """
     obs = yield
     curr_loc = kg.player_location
     undescribed_entities = self.get_descriptionless_entities()
     if undescribed_entities:
         entity = undescribed_entities[0]
         action = gv.Examine(entity.name)
         response = yield action
         entity.description = response
         p_valid = self._valid_detector.action_valid(action, response)
         dbg("[EXM] p={:.2f} {} --> {}".format(p_valid, action,
                                               clean(response)))
         curr_loc.add_action_record(action, 1., response)
     else:
         entity_name = self._to_examine[curr_loc].pop()
         action = gv.Examine(entity_name)
         response = yield action
         p_valid = self._valid_detector.action_valid(
             action, first_sentence(response))
         success = (p_valid > self._validation_threshold)
         self.record(success)
         dbg("[EXM]({}) p={:.2f} {} --> {}".format(
             "val" if success else "inv", p_valid, action, clean(response)))
         curr_loc.add_action_record(action, p_valid, response)
         if success:
             entity = curr_loc.get_entity_by_description(response)
             inv_entity = kg.inventory.get_entity_by_description(response)
             if entity is None and inv_entity is None:
                 entity = Entity(entity_name,
                                 curr_loc,
                                 description=response)
                 # TODO: incorrect for entities discovered inside other entities
                 curr_loc.add_entity(entity)
             else:
                 if entity:
                     dbg("[EXM](val) Discovered alternate name " \
                         "\'{}\' for \'{}\'".format(entity_name, entity.name))
                     entity.add_name(entity_name)
                 if inv_entity:
                     dbg("[EXM](val) Discovered alternate name " \
                         "\'{}\' for inventory item \'{}\'".format(entity_name, inv_entity.name))
                     inv_entity.add_name(entity_name)
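
take_control is a generator-based coroutine: the first yield receives the current observation, and each later yield emits an action and receives the game's response. A hedged sketch of a driver loop, where env_step stands in for whatever executes actions (both names are illustrative, not the project's API):

def drive(module, obs, env_step):
    gen = module.take_control()
    next(gen)                    # run up to the first `obs = yield`
    try:
        action = gen.send(obs)   # deliver the observation, receive an action
        while True:
            response = env_step(action)  # execute the action in the game
            action = gen.send(response)  # feed the response back in
    except StopIteration:
        pass                     # the module finished its turn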
Example #3
def build_text_matrix_no_cols(schema):  # answers schema
    rows = {row['id']: clean(row['text']) for row in schema['rows']}

    choices = {
        choice['id']: clean(choice['text'])
        for choice in schema['choices']
    }
    return rows, choices
Example #4
def main():
    """
    Looks for files in this format:
        <artist> - <title>.mp3
    Writes <artist> and <title> as ID3 tag in the file.
    """
    parser = argparse.ArgumentParser(description='Rename and tag audio files.')
    parser.add_argument("-o", "--omit-youtube-id", action="store_true", help="The files don't have a youtube-ID in their name.")
    parser.add_argument("-l", "--lazy", action='store_true', help="Don't prompt for every file, assume 'yes' for all.")
    parser.add_argument("-p", "--pretend", action='store_true', help="Don't do anything, just print what would be done (Assumes '-l').")
    parser.add_argument("-f", "--filetype", dest="filetype", help="FILETYPE to rename and tag (default: 'mp3').", metavar="FILETYPE", default="mp3")
    parser.add_argument("-s", "--skip-renaming", action="store_true", help="Skip renaming, just tag.")
    parser.add_argument("-a", "--album-tag", dest="albumtag", help="Write ALBUMTAG in the id3 album track tag of all files.", metavar="ALBUMTAG", default="")
    parser.add_argument("directory", help="Directory that holds the audio files.", type=str)
    args = parser.parse_args()

    output_folder = "done"
    full_output_folder = "%s%s%s" % (args.directory, os.sep, output_folder)
    util.create_output_folder(full_output_folder)
    
    if not args.skip_renaming:
        renamr = Renamr(**vars(args))
        renamr.output_folder = full_output_folder
        renamr.directory = args.directory
        renamr.rename()

    for i in os.listdir(args.directory):
        if i.endswith(".%s" % args.filetype): 
            subset = 1 + len(args.filetype)
            filename = i[:-subset]
            splitted = filename.split("-")
            
            if len(splitted) < 2:
                print "[ERROR] Could not process %s" % i
                continue

            artist = string.capwords(splitted.pop(0))
            title = string.capwords(" - ".join(splitted))

            artist = re.sub(' +', ' ', artist)
            title = re.sub(' +', ' ', title)

            full_filename = "%s%s%s" % (args.directory, os.sep, i)
            id3info = eyed3.load(full_filename)

            if id3info.tag is None:
                id3info.initTag()

            id3info.tag.title = unicode(title)
            id3info.tag.artist = unicode(artist)

            if len(args.albumtag) > 0:
                id3info.tag.album = unicode(string.capwords(args.albumtag))

            id3info.tag.save()

    util.clean(full_output_folder)
Example #5
 def getTrackFromTracklist(self, url, artist, track):
     r = requests.get(self.wwwToApi(url))
     cloudcast = json.loads(u.toUtf8(r.text))
     if ('sections' in cloudcast):
         for section in cloudcast['sections']:
             if ('track' in section):
                 tracklistArtistName = section['track']['artist']['name']
                 if (u.clean(tracklistArtistName) == u.clean(artist)):
                     trackName = self.getFullName(section['track']['artist']['name'], section['track']['name'])
                     return trackName
     return None
Example #6
async def match_label(labels_url, control_name, headers, page=1):
    "recursively search for a label that matches the control_name parameter"
    page_number = re.search(r"page=(\d+)?", labels_url).group(1)
    url = labels_url.replace(page_number, str(page))
    labels = await get_api_data(url, headers)
    for label in labels:
        if clean(label["name"]) == clean(control_name):
            return label
        # if we are at the last label on the page and no match is found, request the next page of labels
        if labels.index(label) == len(labels) - 1:
            match = await match_label(url, control_name, headers, page + 1)
            return match
    return None
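
A hedged usage sketch; the repository URL and headers are illustrative, and get_api_data is assumed to return the decoded JSON list for one page:

import asyncio

async def demo():
    labels_url = "https://api.github.com/repos/owner/repo/labels?page=1"
    headers = {"Authorization": "token <TOKEN>"}
    label = await match_label(labels_url, "Bug", headers)
    if label is not None:
        print(label["name"])

# asyncio.run(demo())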
Example #7
def main(argv):
    skip = []
    workdir = os.getcwd()
    proj = None
    bug = None
    onlyFailing = False
    onlyRelevant = False
    onlyTest = None
    mem = None
    try:
        opts, args = getopt.getopt(argv, "hp:b:w:s:t:m:fr")
    except getopt.GetoptError as e:
        print "Error in your arguments: ", e
        show_help()
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            show_help()
            sys.exit()
        elif opt == '-p':
            proj = arg
        elif opt == '-b':
            bug = arg
        elif opt == '-t':
            onlyTest = arg
        elif opt == '-f':
            onlyFailing = True
        elif opt == '-r':
            onlyRelevant = True
        elif opt == '-w':
            workdir = arg
        elif opt == '-s':
            skip = arg.split(',')
        elif opt == '-m':
            mem = arg
    if proj is None or bug is None:
        print "Please specify a project and bug number\n"
        show_help()
        sys.exit(2)

    if 'run' not in skip:
        util.clean(workdir)
        if not defects4j.from_cache(workdir, proj, bug):
            defects4j.checkout(workdir, proj, bug)
            defects4j.run(workdir, proj, bug, onlyFailing, onlyRelevant,
                          onlyTest)
            defects4j.cache(workdir, proj, bug)
    if 'falo' not in skip:
        falo.run(workdir, proj, bug, mem)
    if 'after' not in skip:
        falo.keep_interesting_graphs(workdir, proj, bug)
Example #8
def run_ica_averaged(df, labels):
    # Now do ICA on the mean of the signals. Recreate the data
    df = df_post_outlier.copy()
    df = util.filter_countries_lt_n_samples(df, 5, sample_lookup)
    df = util.clean(df)
    print("ICA shape", df.shape)
    df_countries = df.join(labels)
    df_country_means = df_countries.groupby('label').mean()
    df_country_means = util.clean(df_country_means)
    M = util.build_matrix(df_country_means)
    ICA_projection = util.do_ica(M, M.shape[0])
    np.savetxt("{}/results/ICA_projection.dat".format(dir_path),
               ICA_projection)
    pd.DataFrame(df_country_means.index).to_csv(
        '{}/results/ICA_countries.csv'.format(dir_path))
Example #9
def execute(args):
  np.random.seed(42)
  if len(args) < 1:
    usage()
    sys.exit()
  names, y, x = parse(args[0])
  indices = [int(i) for i in args[1:]]
  relevant_names = names[1:]
  x = clean(relevant_names, x)
  if len(indices) > 0:
    x = [[sample[i] for i in indices] for sample in x]
    relevant_names = [relevant_names[i] for i in indices]
  print "Clustering on", str(relevant_names) + "..."

  labels = np.unique(y)
  kmeans = KMeans(n_clusters=CLUSTER_FACTOR * len(labels), random_state=0)
  y_pred = kmeans.fit_predict(x)

  counts = get_cluster_counts(y, y_pred)
  totals = [0] * len(counts)
  print counts
  for i, mapping in counts.iteritems():
    totals[i] = sum(mapping.values())
  finals = get_final_mapping(counts, totals)
  if len(finals) < len(labels):
    print "WARNING: Not all clusters unique!"
  print "FINAL CLUSTERS", finals
  print
  print "NUM LABELS", len(labels)
  print "ACCURACY", accuracy(finals, labels)
  return accuracy(finals, labels)
Example #10
def submit_blast():
    bundles = get_bundles("old")
    app.logger.info(pformat(request.form))
    form = BlastForm(request.form)

    email_is_valid = validate_email(request.form["stripeEmail"])
    amount = request.form["amount"]

    if email_is_valid:
        customer = stripe.Customer.create(email=request.form["stripeEmail"],
                                          card=request.form["stripeToken"])
        app.logger.info(f"Customer id: {customer.id}")
    else:
        message = "There was an issue saving your email address."
        return render_template("error.html", message=message, bundles=bundles)
    if form.validate():
        app.logger.info("----Adding Blast subscription...")
        add_blast_subscription.delay(customer=customer,
                                     form=clean(request.form))

        if amount == "349":
            event_label = "annual"
        elif amount == "40":
            event_label = "monthly"
        elif amount == "325":
            event_label = "annual tax exempt"

        gtm = {"event_value": amount, "event_label": event_label}

        return render_template("blast-charge.html", bundles=bundles, gtm=gtm)
    else:
        app.logger.error("Failed to validate form")
        message = "There was an issue saving your donation information."
        return render_template("error.html", message=message, bundles=bundles)
Example #11
def do_charge_or_show_errors(template, bundles, function):
    app.logger.debug("----Creating Stripe customer...")

    email = request.form["stripeEmail"]
    installment_period = request.form["installment_period"]
    amount = request.form["amount"]

    try:
        customer = stripe.Customer.create(email=email,
                                          card=request.form["stripeToken"])
    except stripe.error.CardError as e:
        body = e.json_body
        err = body.get("error", {})
        message = err.get("message", "")
        form_data = request.form.to_dict()
        del form_data["stripeToken"]

        return render_template(
            template,
            bundles=bundles,
            key=app.config["STRIPE_KEYS"]["publishable_key"],
            message=message,
            form_data=form_data,
        )
    app.logger.info(f"Customer id: {customer.id}")
    function(customer=customer, form=clean(request.form))
    gtm = {
        "event_value": amount,
        "event_label": "once" if installment_period == "None" else installment_period,
    }
    return render_template("charge.html",
                           gtm=gtm,
                           bundles=get_bundles("charge"))
Example #12
def test__clean():
    form = {
        "a": "None",
        "b": "True",
        "c": "False",
        "d": "None",
        "e": "none",
        "f": None,
        "g": True,
        "h": False,
        "i": 9,
        "j": 8.1,
        "k": "3.2",
        "l": "4",
        "m": "string",
    }
    expected = {
        "a": None,
        "b": True,
        "c": False,
        "d": None,
        "e": "none",
        "f": None,
        "g": True,
        "h": False,
        "i": 9,
        "j": 8.1,
        "k": 3.2,
        "l": 4,
        "m": "string",
    }
    actual = clean(form)
    assert expected == actual
    assert actual["bogus"] is None
Example #13
    def get_init_population(self):
        # load/create db
        fname = os.path.join(self.init_pop_dir, 'init_data.pickle')
        if os.path.isfile(fname):
            with open(fname, 'rb') as f:
                self.db = pickle.load(f)
        else:
            self.db = self.eval_core.generate_data_set(self.n_init_samples,
                                                       evaluate=True)
            with open(fname, 'wb') as f:
                pickle.dump(self.db, f)

        if len(self.db) >= self.n_init_samples:
            random.shuffle(self.db)
            self.db = self.db[:self.n_init_samples]
        else:
            import warnings
            warnings.warn(
                'n_init_samples is larger than the length of the initial '
                'database; using len(db) instead of n_init_samples'
            )

        self.db = clean(self.db, self.eval_core)
        self.db = relable(self.db, self.eval_core)
        self.db = sorted(self.db, key=lambda x: x.cost)
        # HACK for paper
        # self.db = self.db[1:]
        self.logger.log_text("[INFO] Best cost in init_pop = {}".format(
            self.db[0].cost))
Example #14
def main():
    parser = OptionParser(usage="usage: %prog [--input tweets.csv] [--output tweets_clean.txt] [--column tweet]",
                          version="%prog 1.0")
    parser.add_option("-i", "--input", dest="input", default="tweets.csv",
                      help="the input CSV file")
    parser.add_option("-c", "--column", dest="column", default="tweet",
                      help="the CSV column to be cleaned and saved")
    parser.add_option("-o", "--output", dest="output", default="tweets_clean.txt",
                      help="the output text file to be saved")
    (options, args) = parser.parse_args()

    with open(options.output, "w") as my_output_file:

        with open(options.input, "r") as my_input_file:
            
            # one line method
            #[my_output_file.write("".join(util.clean(row[options.column]))+'\n') for row in csv.DictReader(my_input_file)]

            # with progress bar
            size = sum(1 for row in csv.DictReader(my_input_file))
            my_input_file.seek(0)
            reader = csv.DictReader(my_input_file)
            bar = Bar('Processing', max=size)
            for row in reader:
                my_output_file.write("".join(util.clean(row[options.column]))+'\n')
                bar.next()

            bar.finish()
Example #15
def add_blast_subscription(form=None, customer=None):
    """
    Adds a Blast subscription. Blast subscriptions are always recurring. They have two
    email addresses: one for billing and one for the newsletter subscription.

    """

    form = clean(form)

    first_name = form["first_name"]
    last_name = form["last_name"]
    email = form["subscriber_email"]

    logging.info("----Getting contact...")
    contact = Contact.get_or_create(email=email,
                                    first_name=first_name,
                                    last_name=last_name)
    logging.info(contact)

    rdo = RDO(contact=contact)

    rdo.stripe_customer = customer["id"]
    rdo.campaign_id = form["campaign_id"]
    rdo.referral_id = form["referral_id"]
    rdo.lead_source = "Stripe"
    rdo.amount = form.get("amount", 0)
    rdo.agreed_to_pay_fees = form["pay_fees_value"]

    # Blast specific:
    rdo.installments = 0
    rdo.description = "Blast Subscription"
    rdo.open_ended_status = "Open"
    if int(float(rdo.amount)) == 40:
        rdo.installment_period = "monthly"
    else:
        rdo.installment_period = "yearly"
    now = datetime.now(tz=ZONE).strftime("%Y-%m-%d %I:%M:%S %p %Z")
    rdo.name = f"{first_name} {last_name} - {now} - The Blast"
    rdo.type = "The Blast"
    rdo.billing_email = form["stripeEmail"]
    rdo.blast_subscription_email = form["subscriber_email"]

    logging.info("----Saving RDO....")
    apply_card_details(rdo=rdo, customer=customer)
    rdo.save()
    logging.info(rdo)
    # get opportunities
    opportunities = rdo.opportunities()
    today = datetime.now(tz=ZONE).strftime("%Y-%m-%d")
    opp = [
        opportunity for opportunity in opportunities
        if opportunity.expected_giving_date == today
    ][0]
    try:
        charge(opp)
    except ChargeException:
        # TODO should we alert slack? Did not because we had no notifications here before.
        pass

    return True
Example #16
def execute(args):
  if len(args) < 1:
    usage()
    sys.exit()

  # Parse data
  #   names == feature labels
  #   x     == features that correspond to shuffled names
  #   y     == shuffled names
  names, y, x = parse(args[0])
  x = util.clean(names, x)

  # Runs RFC every combination of pairs of users for a binary classification.
  # The number of possible class predictions = 2.

  num_users = len(set(y))
  num_combos = np.math.factorial(num_users) / (2 * np.math.factorial(num_users - 2))

  print "Testing 2-way combinations of users for binary classification:"
  print "Number of users: %d" % (num_users)
  print "Number of combinations: %d" % (num_combos)
  print
  print "================================================================================"
  print "Evaluating Combinations of Users:"
  print 

  combos = get_binary_user_combinations(y)
  COMBO_SCORES = {}
  for c in combos:
    x_pruned, y_pruned = prune_data(x, y, c)

    x_train, x_test, y_train, y_test = train_test_split(x_pruned, y_pruned,
                                                  test_size=TRAIN_PARTITION,
                                                  random_state=0)


    # Set up Random Forest Classifier
    model = RandomForestClassifier(
        n_estimators=FOREST_SIZE,
        criterion=CRITERION,
        max_features=MAX_FEATURES,
        verbose=VERBOSE,
    )

    model.fit(x_train, y_train)

    # Updating combination scores
    if c[0] not in COMBO_SCORES:
      COMBO_SCORES[c[0]] = {}
    if c[1] not in COMBO_SCORES:
      COMBO_SCORES[c[1]] = {}

    score = average_score_k_trials(model, x_test, y_test, 5)
    COMBO_SCORES[c[0]][c[1]] = score
    COMBO_SCORES[c[1]][c[0]] = score

    print "\tEvaluating users: %35s       %f" % (c, score)
  
  # Evaluate final results
  evaluate(COMBO_SCORES, 2)
Example #17
def execute(args):
    np.random.seed(42)
    if len(args) < 1:
        usage()
        sys.exit()
    names, y, x = parse(args[0])
    indices = [int(i) for i in args[1:]]
    relevant_names = names[1:]
    x = clean(relevant_names, x)
    if len(indices) > 0:
        x = [[sample[i] for i in indices] for sample in x]
        relevant_names = [relevant_names[i] for i in indices]
    print "Clustering on", str(relevant_names) + "..."

    labels = np.unique(y)
    kmeans = KMeans(n_clusters=CLUSTER_FACTOR * len(labels), random_state=0)
    y_pred = kmeans.fit_predict(x)

    counts = get_cluster_counts(y, y_pred)
    totals = [0] * len(counts)
    print counts
    for i, mapping in counts.iteritems():
        totals[i] = sum(mapping.values())
    finals = get_final_mapping(counts, totals)
    if len(finals) < len(labels):
        print "WARNING: Not all clusters unique!"
    print "FINAL CLUSTERS", finals
    print
    print "NUM LABELS", len(labels)
    print "ACCURACY", accuracy(finals, labels)
    return accuracy(finals, labels)
Example #18
 def analyze(self, n, source):
     queue = []
     for line in self.file:
         self.numlines += 1
         line = util.clean(line)
         words = queue
         #Assumes each line in the corpus is a separate 'sentence'
         for _ in range(n):
             words.append('-BEGIN-')
         words.extend(line.split()) # current words to be considered
         if line.split():
             self.begin_map[line.split()[0]] += 1
         if ((source == "rap") and ((line == "") or (string.find(line, "verse") != -1) or (string.find(line, "hook") != -1) or (string.find(line, "bridge") != -1))):
             words = []
         queue = [] # reset queue upon reading new line
         while (len(words) > n):
             k = tuple(words[:n])  # the n-word history
             self.frequency_map[k] += 1
             if k not in self.word_map:
                 self.word_map[k] = Counter({words[n]: 1})
             else:
                 self.word_map[k].update({words[n]: 1})
             words.pop(0)
         queue.extend(words)  # add leftover words to the queue
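
The loop above is a sliding-window n-gram counter: each n-word history maps to a Counter of the words observed right after it. A self-contained toy version of that windowing (hypothetical name, same -BEGIN- padding):

from collections import Counter, defaultdict

def ngram_counts(line, n=2):
    words = ['-BEGIN-'] * n + line.split()
    word_map = defaultdict(Counter)
    for i in range(len(words) - n):
        word_map[tuple(words[i:i + n])][words[i + n]] += 1
    return word_map

# ngram_counts("the cat sat")[('-BEGIN-', 'the')] -> Counter({'cat': 1})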
Example #19
def execute(args):
    if len(args) < 1:
        usage()
        sys.exit()

    # Parse data
    #   names == feature labels
    #   x     == features that correspond to shuffled names
    #   y     == shuffled names
    names, y, x = parse(args[0])
    x = util.clean(names, x)

    # Runs a multi-class classification using Random Forest.
    # The number of possible class predictions = number of users.

    print "Running full multi-class classification:"
    print "Number of users: %d" % (len(set(y)))
    print

    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=TRAIN_PARTITION, random_state=0)

    # Set up Random Forest Classifier
    model = RandomForestClassifier(
        n_estimators=FOREST_SIZE,
        criterion=CRITERION,
        max_features=MAX_FEATURES,
        verbose=VERBOSE,
    )

    model.fit(x_train, y_train)

    # Evaluation
    evaluate(model, x_test, y_test)
Example #20
 def __init__(self, obs, action, score, new_obs, terminal):
     message = '\"{}\" --> {} Score={}'.format(action, util.clean(new_obs), score)
     super().__init__(message)
     self.obs      = obs
     self.action   = action
     self.score    = score
     self.new_obs  = new_obs
     self.terminal = terminal
Example #21
    def predict(self, unsubscribe, sender, subject, text, files):

        item = {
            'unsubscribe': self.__get_unsubscribe_str(unsubscribe),
            'sender': self.__get_sender_str(sender),
            'subject': util.clean(subject) if subject is not None else '',
            'text': util.clean(text) if text is not None else '',
            'extensions': self.__get_extensions_str(files)
        }

        x = ''
        for col in self.features_ordered:
            if item[col] is not None and len(item[col]) > 0:
                x += str(item[col]).lower() + ' '
        x = x.strip()

        predictions = self.model.predict(np.array([x]))[0]
        return self.classes[np.argmax(predictions)], predictions, x
Example #22
def run_ica_not_averaged(df, labels):
    # Do ICA not averaged
    df = df_post_outlier.copy()
    df = util.clean(df)
    M = util.build_matrix(df)
    ICA_projection = util.do_ica(M, 30)
    np.savetxt("{}/results/all_samples_ICA_projection.dat".format(dir_path),
               ICA_projection)
    countries = labels.loc[df.index]['label']
    pd.DataFrame(countries).to_csv(
        '{}/results/all_samples_ICA_countries.csv'.format(dir_path))
Example #23
 def process_event(self, event):
     """ Process an event from the event stream. """
     location, message = self.get_event_info(event)
     if location not in self._to_examine:
         self._to_examine[location] = []
     if not message:
         return
     candidate_entities = self.detect_entities(message)
     dbg("[EXM](detect) {} --> {}".format(clean(message),
                                          candidate_entities))
     self.filter(candidate_entities)
Example #24
def process_question(schema_question, question):
    question_text = clean(schema_question['headings'][0]['heading'])
    results = None
    txt_replace_fn = qtype_handlers.get(schema_question['family'])
    if txt_replace_fn:
        results = {question_text: txt_replace_fn(schema_question, question)}
        #print (results)
    else:
        print(f"\n\n...........family : {schema_question['family']}\n\n")

    return results
Example #25
 def observe(self, obs, action, score, new_obs, terminal):
     """ Observe will be used for learning from rewards. """
     p_valid = self._valid_detector.action_valid(action, new_obs)
     dbg("[VALID] p={:.3f} {}".format(p_valid, clean(new_obs)))
     if kg.player_location:
         dbg("[EAGERNESS] {}".format(' '.join(
             [str(module.get_eagerness()) for module in self.modules[:5]])))
     event_stream.push(
         NewTransitionEvent(obs, action, score, new_obs, terminal))
     action_recognized(action, new_obs)  # Update the unrecognized words
     if terminal:
         kg.reset()
Example #26
 def getNeighboursFromTracklist(self, url, artist, track):
     r = requests.get(self.wwwToApi(url))
     cloudcast = json.loads(u.toUtf8(r.text))
     result = []
     if ('sections' in cloudcast):
         sections = cloudcast['sections']
         for (i, section) in enumerate(sections):
             if ('track' in section):
                 tracklistArtistName = section['track']['artist']['name']
                 tracklistTrackName = section['track']['name']
                 if (u.clean(tracklistArtistName).find(u.clean(artist)) >= 0 and u.clean(tracklistTrackName).find(u.clean(track)) >= 0):
                     if (i > 0):
                         prev = sections[i - 1]
                         name = u.toName(prev['track']['artist']['name'], prev['track']['name'])
                         result.append(name)
                     if (i < len(sections) - 1):
                         nxt = sections[i + 1]
                         name = u.toName(nxt['track']['artist']['name'], nxt['track']['name'])
                         result.append(name)
                     break
     return result
Example #27
def process_matrix_with_cols(matrix_schema, data):
    rows, cols, cols_choices = build_text_matrix_with_cols(
        matrix_schema['answers'])
    results = {}  #{'rows': rows.values(), 'cols': cols.values()}

    for answer in data['answers']:
        row_id = answer.get('row_id')
        if not row_id and answer.get('other_id'):
            results['other'] = answer['text']
            continue

        rtext = rows[row_id]
        ans_col_id = answer['col_id']
        ans_choice_id = answer['choice_id']

        ctext = cols[ans_col_id]
        col_choices = cols_choices[ans_col_id]
        if not ctext:
            results[rtext] = clean(
                col_choices[ans_choice_id]
            )  # a single value here, not a list as in the append/[] cases below
            continue

        ans = clean(col_choices[ans_choice_id])
        # { clean(ctext) : clean(col_choices[ans_choice_id]) }

        if results.get(rtext):
            results[rtext].append(ans)
        else:
            results[rtext] = [ans]

    rr = {}
    for k, v in results.items():
        if len(v) == 1:
            rr[k] = v[0]
        else:
            rr[k] = v

    return rr
Example #28
 def action_valid(self, action, response_text):
     if not util.action_recognized(action, response_text):
         return 0.
     label, proba = self.model.predict(util.clean(response_text))
     p_valid = 0
     if label[0] == '__label__invalid':
          p_valid = 1 - proba[0]
     elif label[0] == '__label__valid':
         p_valid = proba[0]
     else:
         assert False, "Unrecognized Label {}".format(label[0])
     # gv.dbg("[LVD]({}) {} p_Valid={:.2f}".format(action, response_text, p_valid))
     return p_valid
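
The __label__ prefixes suggest a fastText-style classifier, where predict returns parallel sequences of labels and probabilities; a minimal stub showing the contract action_valid relies on (illustrative only, not the trained model):

class StubModel:
    def predict(self, text):
        # fastText-style result: one label list and one probability list
        return ['__label__invalid'], [0.93]

# With this stub, action_valid would return 1 - 0.93 = 0.07
# for any recognized action.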
Example #29
def check_mail(r):
    messages = r.get_unread()
    for message in messages:
        action = "None"
        desc = "None"
        message.mark_as_read()
        if util.clean(message.subject) == 'flair request':
            # parse flair request
            request = parse_request(message.body)

        else:
            # send confused reply
            r.send_message(
                message.author, 'Request not recognized',
                'A message from ' + str(message.author) + ' was received, but it '
                'was not a valid request. If you were trying to send a request '
                'please check your subject formatting and try again.')
Example #30
 def to_string(self, prefix=''):
     s = prefix + "Entity: {}".format(self.name)
     if self._action_records:
         for action, (p_valid, resp) in self._action_records.items():
             if p_valid > .5:
                 s += "\n  {}Action record: {} {} - {} (p={:.2f})".format(
                     prefix, action, self.name,
                     util.clean(resp)[:80], p_valid)
     for entity in self._entities:
         s += "\n" + prefix + entity.to_string(prefix + "  ")
     if self._attributes:
         s += "\n  " + prefix + "Attributes: "
         for attribute in self._attributes:
             s += attribute.to_string() + " "
     return s
Example #31
def get_text_qna_mcq(schema_question, data):
    sch_anss = schema_question['answers']
    chosens = []
    ans_ch_ids = [answer.get('choice_id') for answer in data['answers']]
    if ans_ch_ids:
        chosens = list(get_text_by_idlist(ans_ch_ids, sch_anss['choices']))

    if sch_anss.get('other'):
        anstext = [
            clean(answer['text']) for answer in data['answers']
            if 'other_id' in answer
        ]
        if anstext:
            chosens.append(anstext[0])

    return chosens
Example #32
 def tokenise(self):
     train_data = []
     train_label = []
     for i in range(self.Td.shape[0]):
         tweet = self.Td['OriginalTweet'].iloc[i]
         sent = self.Td['Sentiment'].iloc[i]
         tweet = util.unicodeToAscii(tweet)
         tweet = util.clean(tweet)
         tweet = tweet.replace('.', ' ')  # replace() returns a new string
         train_t = [
             word for word in tweet.split(' ') if word != ' ' and word != '.'
         ]
         train_d = [w for w in train_t if w != '']
         if (len(train_d) > 0):
             train_a = np.ones(128)
             for j in range(len(train_d)):
                 train_a[j] = self.data.FindTestWordToIndex(train_d[j])
             train_data.append(train_a)
             train_label.append(self.Label.WordToIndex[sent])
     return (np.array(train_data,
                      dtype=np.long), np.array(train_label, dtype=np.long))
Example #33
def execute(args):
  if len(args) < 1:
    usage()
    sys.exit()

  # Parse data
  #   names == feature labels
  #   x     == features that correspond to shuffled names
  #   y     == shuffled names
  names, y, x = parse(args[0])
  x = util.clean(names, x)

  # Runs a multi-class classification using Random Forest.
  # The number of possible class predictions = number of users.

  print "Running full multi-class classification:"
  print "Number of users: %d" % (len(set(y)))
  print

  x_train, x_test, y_train, y_test = train_test_split(x, y,
                                                  test_size=TRAIN_PARTITION,
                                                  random_state=0)

  # Set up Random Forest Classifier
  model = RandomForestClassifier(
      n_estimators=FOREST_SIZE,
      criterion=CRITERION,
      max_features=MAX_FEATURES,
      verbose=VERBOSE,
  )

  model.fit(x_train, y_train)

  # Evaluation
  evaluate(model, x_test, y_test)
Example #34
def submit_blast():
    bundles = get_bundles("old")
    app.logger.info(pformat(request.form))
    form = BlastForm(request.form)

    email_is_valid = validate_email(request.form["stripeEmail"])

    if email_is_valid:
        customer = stripe.Customer.create(email=request.form["stripeEmail"],
                                          card=request.form["stripeToken"])
        app.logger.info(f"Customer id: {customer.id}")
    else:
        message = "There was an issue saving your email address."
        return render_template("error.html", message=message, bundles=bundles)
    if form.validate():
        app.logger.info("----Adding Blast subscription...")
        add_blast_subscription.delay(customer=customer,
                                     form=clean(request.form))
        return render_template("blast-charge.html", bundles=bundles)
    else:
        app.logger.error("Failed to validate form")
        message = "There was an issue saving your donation information."
        return render_template("error.html", message=message, bundles=bundles)
Example #35
def execute(args):
    np.random.seed(42)
    if len(args) < 1:
        usage()
        sys.exit()
    names, y, x = parse(args[0])
    indices = [int(i) for i in args[1:]]
    relevant_names = names[1:]
    x = clean(relevant_names, x)
    if len(indices) > 0:
        x = np.asarray([[sample[i] for i in indices] for sample in x])
        relevant_names = [relevant_names[i] for i in indices]
    print "Clustering on", str(relevant_names) + "..."

    labels = np.unique(y)
    af = AffinityPropagation(damping=0.52)
    x_train = random_selection(x, int(len(x) * 0.6))
    af.fit(x_train)
    y_pred = af.predict(x)
    un = np.unique(y_pred)

    counts = get_cluster_counts(y, y_pred)
    totals = {}
    print counts
    for i, mapping in counts.iteritems():
        s = sum(mapping.values())
        if s != 0:
            totals[i] = s
    finals = get_final_mapping(counts, totals)
    if len(finals) < len(labels):
        print "WARNING: Not all labels accounted for!"
    print "FINAL CLUSTERS", finals
    print "NUM CLUSTERS", len(counts)
    print "NUM Y_PRED", len([y for y in y_pred if type(y) is not np.ndarray])
    print
    print "ACCURACY", accuracy(finals, labels)
    return accuracy(finals, labels), len(counts)
Example #36
def unsubscribe(falcon_client, mail_processed):
    should_unsub, unsub_val = is_newsletter(mail_processed)
    if should_unsub:
        subject = clean(mail_processed['Subject'])
        unsub_list = mail_processed['Unsubscribe']

        unsub_list = filter(lambda y: y.startswith('mailto:'),
                            [x.strip()[1:-1] for x in unsub_list.split(', ')])
        unsub_list = list(unsub_list)

        unsub_mail = None
        unsub_subject = 'Unsubscribe'
        if len(unsub_list) > 0:
            unsub_mail = unsub_list[0].replace('mailto:', '')
            unsub_subject_idx = unsub_mail.find('?subject=')
            if unsub_subject_idx > -1:
                unsub_subject = unsub_mail[unsub_subject_idx:].replace('?subject=', '')
                unsub_mail = unsub_mail[:unsub_subject_idx]

        tag = 'Unsubscribing from email list: '
        if unsub_mail is None:
            tag = 'Cannot unsub, moving to trash: '

        print(
            tag,
            subject,
            unsub_mail,
            unsub_subject,
            sep='\n',
            end='\n------------------\n'
        )

        if unsub_mail is not None:
            try:
                falcon_client.gmail.send_to_unsubscribe(unsub_mail, unsub_subject)
            except Exception as exp:
                print('Failed to unsub.', exp)
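
A worked example of the List-Unsubscribe parsing above; the header value is illustrative:

header = '<mailto:list@example.com?subject=unsub>, <https://example.com/u>'
entries = [x.strip()[1:-1] for x in header.split(', ')]  # drop the <> brackets
mailtos = [x for x in entries if x.startswith('mailto:')]
addr = mailtos[0].replace('mailto:', '')
idx = addr.find('?subject=')
subject = addr[idx:].replace('?subject=', '') if idx > -1 else 'Unsubscribe'
addr = addr[:idx] if idx > -1 else addr
# addr -> 'list@example.com', subject -> 'unsub'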
Example #37
def execute(args):
  print 'Starting the artificial neural network'
  if len(args) < 2:
    usage()
    sys.exit()

  ###############################################################################
  # Data

  # names     feature labels
  # y         shuffled names
  # x         features that correspond to shuffled names
  names, y, x = parse(args[1])
  x = clean(names, x)
  usePowerset = args[0]

  # Build features to include in test
  features = args[2:]
  if len(features) == 0:
    features = names
  # print 'Selected features:', features

  # Build all subsets of features, if requested
  if usePowerset.lower() == 'true':
    combos = powerset(features)
  else:
    combos = [features]

  # map from feature set, to map of correct counts for each person
  feature_performance = {}
  highest_correct = 0
  best_combo = {}
  for c in combos:
    if len(c) == 0:
      continue
    print 'Attempting feature set:', c
    x_selected = selectFeatures(copy.copy(names), c, x)

    # Split into testing and training data
    x_train, x_test, y_train, y_test = train_test_split(x_selected, y,
                                                      test_size=0.2,
                                                      random_state=0)

    ###############################################################################
    # Models

    logistic = linear_model.LogisticRegression(C=L_REGULARIZATION)
    rbm = BernoulliRBM(random_state=0, verbose=True, learning_rate=N_LEARNING_RATE, n_iter=N_ITER, n_components=N_COMPONENTS)

    # Note: attempted StandardScaler, MinMaxScaler, MaxAbsScaler, without strong results
    # Not needed, since data is scaled to the [0-1] range by clean()
    classifier = Pipeline(steps=[('rbm', rbm), ('logistic', logistic)])

    # ###############################################################################
    # Training
    print 'Training the classifier...'
    # Training RBM-Logistic Pipeline
    classifier.fit(x_train,y_train)
    correct = 0
    label_counts = defaultdict(int)
    for i in range(len(x_test)):
      test = x_test[i]
      if len(test) == 1:
        test = test.reshape(-1, 1)
      else:
        test = [test]
      predicted = classifier.predict(test)

      if predicted == y_test[i]:
        correct += 1
        label_counts[predicted[0]] += 1

    if correct >= highest_correct:
      highest_correct = correct
      best_combo = c
    feature_performance[str(c)] = {'predictions': label_counts, 'expected': Counter(y_test)}

    ###############################################################################
    # Evaluation
    # evaluate(classifier, x_test, y_test)

  summary = feature_performance[str(best_combo)]
  print 'Accuracy:\t\t\t', highest_correct, 'correct gives', (highest_correct * 1.0/len(y_test)), 'compared to guessing', (1.0/len(summary['expected']))
  print 'Best feature set:\t\t', best_combo
  print 'Identified %d out of %d labels'%(len(summary['predictions']),len(summary['expected']))
  for p in summary['predictions']:
    pred = summary['predictions'][p]
    tot = summary['expected'][p]
    print '\t %s \t\t %d\t of %d \t (%f)'%(p, pred, tot, pred * 1.0/tot)
Example #38
def execute(args):
  print 'Starting the gridsearch on the artificial neural network'
  if len(args) < 1:
    usage()
    sys.exit()

  ###############################################################################
  # Data

  # names     feature labels
  # y         shuffled names
  # x         features that correspond to shuffled names
  names, y, x = parse(args[0])
  x = clean(names, x)

  # Build features to include in test
  features = args[1:]
  if len(features) == 0:
    features = names
  print 'Selected features:', features
  x = selectFeatures(names, features, x)

  # Split into testing and training data
  x_train, x_test, y_train, y_test = train_test_split(x, y,
                                                    test_size=0.2,
                                                    random_state=0)

  ###############################################################################
  # Models

  logistic = LogisticRegression()
  rbm = BernoulliRBM(random_state=0, verbose=True)
  classifier = Pipeline(steps=[('rbm', rbm), ('logistic', logistic)])

  # ###############################################################################
  # Training
  print 'Training the classifier...'
  # Training RBM-Logistic Pipeline
  # classifier.fit(x_train, y_train)

  # Training Logistic regression
  # logistic_classifier = LogisticRegression(C=100.0)
  # logistic_classifier.fit(x_train, y_train)
  # evaluate(classifier, logistic_classifier, x_test, y_test)

  ###############################################################################
  # Evaluation

  scores = ['precision', 'recall']
  for score in scores:
      print("# Tuning hyper-parameters for %s" % score)
      print()

      clf = GridSearchCV(classifier, param_grid=param_grid, cv=3, scoring='%s_weighted' % score)
      clf.fit(x_train, y_train)

      print("The best parameters are %s with a score of %0.2f"
       % (clf.best_params_, clf.best_score_))
      print()
      print("Grid scores on development set:")
      print()
      for params, mean_score, scores in clf.grid_scores_:
          print("%0.3f (+/-%0.03f) for %r"
                % (mean_score, scores.std() * 2, params))
      print()

      print("Detailed classification report:")
      print()
      print("The model is trained on the full development set.")
      print("The scores are computed on the full evaluation set.")
      print()
      y_true, y_pred = y_test, clf.predict(x_test)
      print(classification_report(y_true, y_pred))
      print()
Example #39
        usage()
    args = sys.argv[1:]
    opts = "ipcsh"
    long_opts = "index publish clean sync help".split()
    try:
        opts, args = getopt.getopt(sys.argv[1:], opts, long_opts)
    except getopt.GetoptError, err:
        # print help information and exit:
        print str(err)  # will print something like "option -a not recognized"
        usage()
        sys.exit(2)

    for o, a in opts:
        if o in ("-h", "--help"):
            usage()
            sys.exit()
        elif o in ("-i", "--index"):
            indexify()
        elif o in ("-p", "--publish"):
            publish()
        elif o in ("-c", "--clean"):
            util.clean()
        elif o in ("-s", "--sync"):
            sync()
        else:
            assert False, "unhandled option"


if __name__ == "__main__":
    main()
Example #40
def compact(number):
    """Convert the number to the minimal representation. This strips the
    number of any valid separators and removes surrounding whitespace."""
    return clean(number, ' -').upper().strip()
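
A hedged usage note, assuming clean(number, ' -') deletes the listed separator characters (as python-stdnum's util.clean does):

>>> compact(' nl-12 34 ')
'NL1234'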
Example #41
password = '******'
irw = iRWebStats()
irw.login(user, password)
if not irw.logged:
    print("Couldn't log in to iRacing Membersite. Please check your credentials")
    exit()

# Cars driven by user
r = irw.cars_driven()  # Returns cars id

print("\n--> 1. Cars driven by custid:%s \n" % (irw.custid))
print("\n".join([irw.CARS[c]['name'] for c in r]))

# Career stats
r = irw.career_stats()
print("\n--> 2. Career stats for custid:%s \n" % (irw.custid))
print(("Starts: %s, Wins: %s, Top 5: %s, Total Laps: %s," +
       " Laps Led: %s") % (r['starts'], r['wins'], r['top5'], r['totalLaps'],
                           r['lapsLed']))

# Driver search
print("\n--> 3. Driver search (Road racers with Average finish from 1 to 3)\n")
drivers, total_drv = irw.driver_search(
    race_type=ct.RACE_TYPE_ROAD, avg_finish=(1, 3), active=True, page=1)

print("Total drivers found: %s. Showing the first %s" % (total_drv,
                                                         len(drivers)))
print("\n".join(["%s - %s: %s" % (i + 1, clean(x['displayname']), x['irating'])
                for i, x in enumerate(drivers)]))
Example #42
    def rename(self):
        # length of id + ".mp3"
        # example: -CrTMCLxkONk.mp3
        yt_id_len = 16
    
        for iter_file in os.listdir(self.directory):
            if iter_file.lower().endswith(".%s" % self.kwargs["filetype"]): 

                if self.kwargs["omit_youtube_id"]:
                    filename = iter_file
                else:
                    filename = "%s" % iter_file[:-yt_id_len]
    
                filename = re.sub(' +', ' ', filename.strip())
                try:
                    filename = unicode(filename, "utf-8")
                except UnicodeDecodeError as e:
                    print "Error in file: %s: %s" % (filename, e)
                    return
    
                if self.kwargs["omit_youtube_id"]:
                    filename = string.capwords(filename)
                    # write filetype lowercase
                    filename = filename.replace(self.kwargs["filetype"].title(), self.kwargs["filetype"].lower())
                else:
                    filename = "%s.mp3" % string.capwords(filename)
    
                filename = " ".join(filename.split())
                name_str = "old> %s \nnew> %s" % (iter_file, filename.encode('utf-8')) 
    
                processing = True
                while processing:
                    if not self.kwargs["pretend"]:
                        print name_str
                        print "Is this ok? (y)es / (q)uit / (s)kip / <filename> (with or w/o '.%s')" % (self.kwargs["filetype"])
    
                    if self.kwargs["lazy"] or self.kwargs["pretend"]:
                        var = "y"
                    else:
                        var = sys.stdin.readline().strip()

                    if var == "":
                        continue

                    source = "%s%s%s" % (self.directory, os.sep, iter_file)
                    target = "%s%s%s" % (self.output_folder, os.sep, filename.encode('utf-8'))

                    if var == "y":
                        if self.kwargs["pretend"]:
                            print prettify(source, target)
                        else:
                            os.rename(source, target)
                        processing = False
                    elif var == "q":
                        util.clean(self.output_folder)
                        exit()
                    elif var == "s":
                        target = "%s%s%s" % (self.output_folder, os.sep, iter_file)
                        os.rename(source, target)
                        processing = False
                    else:
                        var = self.strip_trailing_file_type(var) # Strip trailing file type
                        target = "%s%s%s.%s" % (self.output_folder, os.sep, var, self.kwargs["filetype"])
                        os.rename(source, target)
                        processing = False
Example #43
                    help="number of most frequently used words to compute. "
                          + "defaults to 4.",
                    type=nat,
                    default=4)
args = parser.parse_args()
txtsrc = opentext(args.pdf, args.gutenberg, args.filename)

# traverse the whole file, adding canonical forms of valid words into a
# dictionary counting the number of appearances.
d = dict()
for line in txtsrc:
    # get rid of ASCII em and en dashes
    line = (line.replace("---", " ")).replace("--", " ")

    for word in line.split():
        clean_word = clean(word)
        if clean_word is None:
            # ignore words that don't parse
            continue
        else:
            # add or update words that do parse
            incr(clean_word, d)

# if we're not reading from a PDF, we have to close the file handle once
# we're done counting all the words. the other three settings close
# themselves.
if not (args.pdf or args.gutenberg):
    txtsrc.close()

# abort if the query makes no sense. note that we can't check this until we
# build the dictionary: it depends on the number of unique words.
Example #44
#!/usr/bin/env python

from util import make_directories, clean
import scripts
import os.path

if __name__ == "__main__":
    make_directories()
    scripts.network_2()
    clean()
    make_directories()
    scripts.network_3()
    clean()
    scripts.crypto_1()
    scripts.crypto_2()