def RateSimilarity(self, first_word, second_word):
    if first_word is None or second_word is None:
        return self.default_match_score
    first_word = first_word.lower()
    second_word = second_word.lower()
    if first_word == second_word:
        return self.default_match_score
    half_length = min(len(first_word), len(second_word)) // 2 + 1
    common1 = self.GetCommonCharacters(first_word, second_word, half_length)
    common_matches = len(common1)
    if common_matches == 0:
        return self.default_match_score
    common2 = self.GetCommonCharacters(second_word, first_word, half_length)
    if common_matches != len(common2):
        return self.default_match_score
    transpositions = 0
    for i in range(common_matches):
        if common1[i] != common2[i]:
            transpositions += 1
    transpositions /= 2
    jaro_metric = (common_matches / (3.0 * len(first_word))
                   + common_matches / (3.0 * len(second_word))
                   + (common_matches - transpositions)
                   / (3.0 * common_matches))
    return jaro_metric
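For cross-checking the method above, here is a self-contained sketch of the textbook Jaro metric (an editorial addition, not part of the original class; RateSimilarity uses a matching window of min-length // 2 + 1 via its GetCommonCharacters helper, so the two can differ slightly on some pairs):

def jaro(s1, s2):
    # Textbook Jaro: the match window is floor(max_len / 2) - 1.
    s1, s2 = s1.lower(), s2.lower()
    if s1 == s2:
        return 1.0
    window = max(len(s1), len(s2)) // 2 - 1
    pairs = []                  # (i, j) index pairs of matched characters
    used = [False] * len(s2)
    for i, c in enumerate(s1):
        lo, hi = max(0, i - window), min(len(s2), i + window + 1)
        for j in range(lo, hi):
            if not used[j] and s2[j] == c:
                used[j] = True
                pairs.append((i, j))
                break
    m = len(pairs)
    if m == 0:
        return 0.0
    m1 = [s1[i] for i, _ in pairs]                      # matches in s1 order
    m2 = [s2[j] for j in sorted(j for _, j in pairs)]   # matches in s2 order
    t = sum(a != b for a, b in zip(m1, m2)) / 2.0       # half the mismatches
    return (m / len(s1) + m / len(s2) + (m - t) / m) / 3.0

print(round(jaro("MARTHA", "MARHTA"), 4))  # 0.9444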
def getUserInputTF(prompt):
    # FUNCTION PURPOSE: Get a valid boolean (True or False) from user
    #
    # INPUTS:
    #   prompt: String that is printed to the console to prompt user input
    #
    # OUTPUTS:
    #   userInput: Boolean containing the user's answer to 'prompt'

    # Print the prompt to console, followed by the user's input options ("Y" or "N")
    print(prompt+" (Y/N)")

    # userInput starts as empty string
    userInput = ""

    # While userInput remains empty, get input
    while not userInput:
        userInput = input()
        # If input isn't either "Y" or "N", set userInput to empty string
        if userInput.lower() != "y" and userInput.lower() != "n":
            print("Please enter a valid answer (Y/N):")
            # Console output to let user know requirements
            userInput = ""

    # Now that the loop has finished, return True for "Y" and False for "N"
    if userInput.lower() == "y":
        return True
    else:
        return False
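A minimal usage sketch for the helper above (the prompt string is illustrative):

# Loops until the user types Y/y or N/n, then yields a boolean.
overwrite = getUserInputTF("Overwrite the existing output file?")
print("Overwriting." if overwrite else "Keeping the existing file.")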
def ComputeClosestWordsSimilarity(self, s1, s2):
    max_similarity = 0
    s1_closest_words = self.GetClosesWords(s1)
    s2_closest_words = self.GetClosesWords(s2)
    for ss1 in s1_closest_words:
        for ss2 in s2_closest_words:
            syn_similarity = self.w_2_vec_util.GetWord2VecSimilarity(
                str(ss1).lower(), str(ss2).lower())
            if syn_similarity > max_similarity:
                max_similarity = syn_similarity
    return max_similarity
def getUserInputTF(prompt):
    print(prompt + " (Y/N)")  # Prompt the user for a yes-or-no answer
    userInput = ""
    while not userInput:
        userInput = input()
        if userInput.lower() != "y" and userInput.lower() != "n":
            print("Please enter a 'Y' or 'N'")
            userInput = ""
    return userInput.lower() == "y"
Example #5
def getUserInputUnits():
    print("Enter the unit of time to use with this plot:")
    userInput = ""
    while not userInput:
        userInput = input()
        if userInput.lower() not in ("hours", "days", "months", "years"):
            print("Please enter either 'hours', 'days', 'months', or 'years':")
            userInput = ""

    return userInput.lower()
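Usage sketch combining the two prompt helpers (prompt text is illustrative):

units = getUserInputUnits()     # e.g. the user types "Days" -> "days"
show_grid = getUserInputTF("Show grid lines on the plot?")
print("Plotting per", units, "with grid:", show_grid)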
Example #6
def load_source_rows(tab, names, key='assoc'):
    """Load the rows from a table that match a source name.

    Parameters
    ----------
    tab : `astropy.table.Table`
       Table that will be searched.

    names : list
       List of source identifiers.

    key : str
       Name of the table column that will be searched for a matching
       source name.

    Returns
    -------
    outtab : `astropy.table.Table`
       Table containing the subset of rows with matching source identifiers.

    """
    names = [name.lower().replace(' ', '') for name in names]
    col = tab[[key]].copy()
    col[key] = defchararray.replace(defchararray.lower(col[key]),
                                    ' ', '')
    mask = create_mask(col, {key: names})
    return tab[mask]
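A usage sketch for load_source_rows; it assumes astropy is installed and that the create_mask helper called above is importable alongside it:

from astropy.table import Table

tab = Table({'assoc': ['Mkn 421', 'PKS 2155-304', '3C 273'],
             'flux': [1.0, 2.0, 3.0]})
# Matching ignores case and whitespace, so 'mkn421' selects 'Mkn 421'.
sel = load_source_rows(tab, ['mkn421'], key='assoc')
print(len(sel))  # expected: 1, assuming create_mask keeps listed values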
Example #7
    def getProblemSet(self, options):
        variance = options.sample_var
        sample_amount = options.sample_amount

        ttk = self.transition_kernel

        # index 1 and 2 are kernel indices, 3 is the sample index
        mu = _np.repeat(ttk[:, :, :, _np.newaxis], sample_amount, axis=3)

        # if we use variance scaling.
        # make a kernel for each variance between upper and lower limit
        if options.variance_scaling:
            variance = _np.divide(
                range(options.variance_lower, options.sample_amount),
                options.sample_amount / options.variance_upper)

        if options.sample_method.lower() == "uniform":
            # sample from uniform
            tk_low, tk_up = Interval.compute_interval(mu, variance)
            non_normalized_tks = _np.random.uniform(tk_low, tk_up)
        elif options.sample_method.lower() == "monte carlo":
            non_normalized_tks = monte_carlo_sampling(
                self.transition_kernel, sample_amount,
                options.monte_carlo_sampling_init_count_value,
                options.monte_carlo_sampling_random_samples)
        else:
            # sample from normal
            non_normalized_tks = _np.random.normal(mu, variance)

        problems_out = []
        for i in range(sample_amount):
            tk = self.normalize_tk(non_normalized_tks[:, :, :, i])
            for a in options.non_robust_actions:
                tk[a] = self.transition_kernel[a]

            distance = 0
            for a in range(self.transition_kernel.shape[0]):
                for s in range(self.transition_kernel.shape[1]):
                    distance += wasserstein_distance(
                        tk[a][s], self.transition_kernel[a][s])
            new_problem = Problem(tk, self.reward_matrix, self.discount_factor,
                                  self.name, distance)
            # new_problem.transition_kernel = tk
            problems_out.append(new_problem)

        return ProblemSet(problems_out, self, options, Sampling.ALL)
Example #8
def find_i_nodes(g):
    nodes = dict(g.nodes)
    i_node_list = []
    for key in nodes:
        if 'i' == nodes[key]['type'].lower():
            i_node_list.append((key, nodes[key]))

    return i_node_list
Example #9
def dbscan(metric, eps, min_samples):
    dbscan = DBSCAN(metric=metric, eps=eps, min_samples=min_samples)
    configuration = [
        dbscan.__class__.__qualname__.lower(),
        str(dbscan.metric),
        str(dbscan.min_samples),
        str(dbscan.eps)
    ]
    return configuration, dbscan
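Usage sketch: the factory returns a (configuration, estimator) pair, where the configuration is a list of strings describing the model:

config, model = dbscan(metric='euclidean', eps=0.5, min_samples=5)
print(config)  # ['dbscan', 'euclidean', '5', '0.5']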
Example #10
def affinity_propagation(number_init, max_iterations, damping):
    ap = AffinityPropagation(max_iter=max_iterations,
                             convergence_iter=number_init,
                             damping=damping)
    configuration = [
        ap.__class__.__qualname__.lower(),
        str(ap.convergence_iter)
    ]
    return configuration, ap
def affichage():
    """
    Display the list of available filters
    """
    print("List of available filters (all lowercase)")
    with os.scandir("filters") as entries:
        for entry in entries:
            var = entry.name.split(".py")
            for a in var:
                print(a.lower())
Example #12
def kmeans(number_clusters, number_init, max_iterations):
    kmeans = KMeans(n_clusters=number_clusters,
                    n_init=number_init,
                    max_iter=max_iterations)
    configuration = [
        kmeans.__class__.__qualname__.lower(),
        str(kmeans.n_clusters),
        str(kmeans.n_init)
    ]
    return configuration, kmeans
Example #13
def agglomerative(number_clusters, affinity, linkage):
    ac = AgglomerativeClustering(n_clusters=number_clusters,
                                 affinity=affinity,
                                 linkage=linkage)
    configuration = [
        ac.__class__.__qualname__.lower(),
        str(ac.n_clusters),
        str(ac.affinity),
        str(ac.linkage)
    ]
    return configuration, ac
Example #14
def gaussian_mixture(number_clusters, number_init, max_iterations):
    gm = GaussianMixture(n_components=number_clusters,
                         random_state=0,
                         n_init=number_init,
                         max_iter=max_iterations)

    configuration = [
        gm.__class__.__qualname__.lower(),
        str(gm.n_components),
        str(gm.n_init)
    ]
    return configuration, gm
Example #15
def spectral_clustering(affinity, assign_labels, number_init,
                        number_neighbors):
    sc = SpectralClustering(affinity=affinity,
                            assign_labels=assign_labels,
                            n_init=number_init,
                            n_neighbors=number_neighbors)
    configuration = [
        sc.__class__.__qualname__.lower(),
        str(sc.n_init),
        str(sc.affinity),
        str(sc.assign_labels),
        str(sc.n_neighbors)
    ]
    return configuration, sc
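All of the factories above share this shape; a hedged end-to-end sketch on synthetic data (assumes numpy and the scikit-learn imports used by the snippets):

import numpy as np

X = np.random.rand(200, 2)   # synthetic 2-D points
config, km = kmeans(number_clusters=4, number_init=10, max_iterations=300)
labels = km.fit_predict(X)
print(config, labels.shape)  # ['kmeans', '4', '10'] (200,)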
Example #16
def movie_wordcloud(df):
    title_df = df.select("id", "title")
    # Clean text
    df_clean = title_df.select(
        "id",
        lower(regexp_replace('title', "[^a-zA-Z\\s]", "")).alias('title'))

    # Tokenize text
    tokenizer = Tokenizer(inputCol='title', outputCol='words_token')
    df_words_token = tokenizer.transform(df_clean).select('id', 'words_token')

    # Remove stop words
    remover = StopWordsRemover(inputCol='words_token', outputCol='words_clean')
    df_words_no_stopw = remover.transform(df_words_token).select(
        'id', 'words_clean')

    #df_words_no_stopw.show(10)

    wordsDF = df_words_no_stopw.select(explode("words_clean").alias("words"))

    wordsDF = wordsDF.select(trim(wordsDF.words).alias("words"))
    #wordsDF.show()

    wordCountDF = wordsDF.groupBy("words").count().orderBy(
        desc("count")).limit(16)
    #wordCountDF.show()
    pandD = wordCountDF.toPandas()
    pandD.drop(0, inplace=True)

    sns.barplot(y='words', x='count', data=pandD)
    plt.title("Movie Title Analysis")
    plt.xlabel('Word Frequency')
    plt.ylabel('Words')
    #plt.show()

    wordCountDF = wordsDF.groupBy("words").count().orderBy(
        desc("count")).limit(101)
    pandD = wordCountDF.toPandas()
    pandD.drop(0, inplace=True)  # drop first row

    wordcloudConvertDF = pandD.set_index('words').T.to_dict('records')
    wordcloud = WordCloud(width=800, height=500, random_state=21, max_font_size=100, relative_scaling=0.5,
                          colormap='Dark2') \
        .generate_from_frequencies(dict(*wordcloudConvertDF))
    plt.figure(figsize=(14, 10))
    plt.imshow(wordcloud, interpolation="bilinear")
    plt.title("Words Cloud - Movie Titles")
    plt.axis('off')
    plt.show()
    """# Overview Cloud
Example #17
def main():
    print(color.BLUE + 'Execution Date: ' + color.END + Receipt_date)
    while True:

        req_user = input(
            "Want to stay out of brain burns - try Chuck therapy? (y/n): "
        ).lower()
        if req_user == 'y':
            api_call()
        elif req_user == 'n':
            print(
                "I hope you are normal now... A last joke for you, enjoy it!")
            api_call()
            break
        else:
            print(color.RED + "Please use y or n" + color.END)
            continue
Example #18
def process_message(body):
    if 'format' not in body or 'path' not in body or 'loadTo' not in body:
        print('missing one or more fields in body object')
        return False

    fmt = body['format'].lower()
    file = body['path']
    loadTo = body['loadTo']

    if os.path.exists(file):
        if fmt == 'csv':
            return load_csv_to_db(file, loadTo)
        elif fmt == 'json':
            return load_json_to_db(file, loadTo)
        else:
            print("error - received {0} format, invoice format must be csv or json".format(fmt))
            return False
    else:
        print("could not find given path")
        return False
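Usage sketch for process_message (the path and target table are hypothetical):

ok = process_message({'format': 'CSV',
                      'path': '/tmp/invoice.csv',
                      'loadTo': 'invoices'})
print(ok)  # False unless /tmp/invoice.csv exists and load_csv_to_db succeeds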
Example #19
def find_rows_by_string(tab, names, colnames=['assoc']):
    """Find the rows in a table ``tab`` that match at least one of the
    strings in ``names``.  This method ignores whitespace and case
    when matching strings.

    Parameters
    ----------
    tab : `astropy.table.Table`
       Table that will be searched.

    names : list
       List of strings.

    colnames : list
       Names of the table columns that will be searched for matching strings.

    Returns
    -------
    mask : `~numpy.ndarray`
       Boolean mask for rows with matching strings.

    """
    mask = np.empty(len(tab), dtype=bool)
    mask.fill(False)
    names = [name.lower().replace(' ', '') for name in names]

    for colname in colnames:

        if colname not in tab.columns:
            continue

        col = tab[[colname]].copy()
        col[colname] = defchararray.replace(
            defchararray.lower(col[colname]).astype(str), ' ', '')
        for name in names:
            mask |= col[colname] == name
    return mask
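Usage sketch; unlike load_source_rows above, this helper returns the boolean mask itself (assumes the numpy/defchararray imports used by the snippet):

from astropy.table import Table

tab = Table({'assoc': ['Mkn 421', 'PKS 2155-304']})
mask = find_rows_by_string(tab, ['MKN 421'])
print(mask)        # [ True False]
print(tab[mask])   # the matching row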
def create_xml(config_df, isd, wftype):
    wf_name = "sit_aml_" + sys_name_l + "_" + country_l + "_" + wftype
    isd_tbl_list = config_df["isd"].tolist()
    for cnt, tab in enumerate(list(chunks(isd_tbl_list, SIZE))):
        if cnt == 0:
            workflow_name = wf_name + "_wf"
            workflow_file = wf_name + "_wf.xml"

            workflow_name_adhoc = wf_name + "_batch_wf"
            workflow_file_adhoc = wf_name + "_batch_wf.xml"
        else:
            workflow_name = wf_name + "_wf" + "_" + str(cnt)
            workflow_file = wf_name + "_wf" + "_" + str(cnt) + ".xml"

            workflow_name_adhoc = wf_name + "_batch_wf" + "_" + str(cnt)
            workflow_file_adhoc = wf_name + "_batch_wf" + "_" + str(
                cnt) + ".xml"

        create_file(sys_name_l, country_l, 'wf', workflow_name)
        create_file(sys_name_l, country_l, 'adhoc', workflow_name_adhoc)

        f_source_count = open(output_folder + '/sourcecount.txt', "a+")
        f_incremental = open(output_folder + '/' + workflow_file, "w+")
        f_adhoc = open(output_folder + '/' + workflow_file_adhoc, "w+")
        f_incremental.write(first + '\n')
        f_adhoc.write(first + '\n')
        for table in tab:
            table = table.strip()
            if len(table.strip()) == 0:
                print("ISD table name cannot be blank")
                os.remove('./running.script')
                sys.exit(1)

            tbl = config_df[config_df["isd"] == table]
            print(tbl)
            DATABASE = str(tbl["database"].tolist()[0]).lower().strip()
            table_name = str(tbl["table_name"].tolist()[0]).lower().strip()
            INC_WHERE = str(
                tbl["inc_filtercondition"].tolist()[0]).lower().strip()
            HIS_WHERE = str(
                tbl["batch_filtercondition"].tolist()[0]).lower().strip()
            output_file_name = '_'.join([country_u, sys_name_u, table.upper()])

            create_file(sys_name_l, country_l, 'table',
                        output_file_name.lower())
            f_source_count.write(
                "select '" + output_file_name.upper() +
                "',COUNT(1),'${businessday}' FROM ${aml_sri_open}." +
                table_name.lower() + " WHERE " + INC_WHERE.lower() +
                " UNION ALL \n ")

            print("Table Name : " + table)
            f_incremental.write(header + '\n')
            f_adhoc.write(header + '\n')

            col_list = get_ISD_Col((isd.sheet_by_name(table)))
            print("No of columns: %i\n" % len(col_list))
            all_cols = ','.join(col_list)
            for col in col_list:
                f_incremental.write(cols.replace('COLNAME', col) + '\n')
                f_adhoc.write(cols.replace('COLNAME', col) + '\n')

            inc_path_value = path.replace('user_name', USER_NAME).replace(
                'output_file_name',
                output_file_name).replace('db_name', DATABASE).replace(
                    'table_name',
                    table_name).replace('var_column_order', all_cols).replace(
                        'WHERE_CLAUSE', INC_WHERE)

            hist_path_value = path.replace('user_name', USER_NAME).replace(
                'output_file_name',
                output_file_name).replace('db_name', DATABASE).replace(
                    'table_name',
                    table_name).replace('var_column_order', all_cols).replace(
                        'WHERE_CLAUSE', HIS_WHERE)

            f_incremental.write(inc_path_value + '\n')
            f_adhoc.write(hist_path_value + '\n')


        trailer_value_inc = trailer.replace('dest_path', DEST_PATH).replace('wrk_name', workflow_name).\
            replace('db_id', DB_ID).replace('storage', STORAGE)

        trailer_value_adhoc = trailer.replace('dest_path', DEST_PATH).replace('wrk_name', workflow_name_adhoc). \
            replace('db_id', DB_ID).replace('storage', STORAGE)

        f_incremental.write(trailer_value_inc)
        f_adhoc.write(trailer_value_adhoc)
        f_incremental.close()
        f_adhoc.close()
        f_source_count.close()
Example #22
    # - The config file options are not allowed to contain them
    for skip_option in {'config', 'help'}:
        del args[skip_option]

    return merge_args(args, config_args)


if __name__ == '__main__':
    config = get_options()

    number_runs = int(config['number_runs'])
    verbose = bool(config['verbose'])
    classification_problem = bool(config['classification'])

    # Specify name of the dataset and percentage of the entire data volume to sample
    dataset_name = config['dataset'].lower()
    subsample_factor = float(config['subsample'])

    # Setting for the neural network
    model_name = str(config['model']).lower()
    num_neurons = int(config['neurons'])
    num_layers = int(config['layers'])
    num_neurons_list = [num_neurons for i in range(num_layers)]
    activation = str(config['activation']).lower()
    use_bias = bool(config['bias'])

    # Generic parameters for optimizer
    optimizer_name = str(config['optimizer']).lower()
    epochs = int(config['epochs'])
    learning_rate = float(config['learning_rate'])
    threshold = float(config['threshold'])
Example #23
        createdON = createdArr[0]

    if len(nStartArr) > 0:
        startAt = nStartArr[0]
        startON = nStartArr[1]

    if len(endArr) > 0:
        endAt = endArr[0]
        #print(row['nEnd'])
    # endOn = endArr[1]

# Removing all non-numeric characters from a string in Python:
#   re.sub("[^0-9]", "", "sdkjh987978asd098as0980a98sd")
# print(re.sub("[^0-9]", "", row['SumInsured']))
# re.sub("[^0-9]", "", row['Premium'])

    prefix = str(row['BinderNo'])[0:5].lower()
    branch = prefix[0:4]

    if "kin" in prefix:
        branch = "Kingston"
    if "por" in prefix:
        branch = "Port Antonio"
    if "fal" in prefix:
        branch = "Falmouth"
    if "old" in prefix:
        branch = "Old Harbour"
    if "och" in prefix:
        branch = "Ocho Rios"
    if "con" in prefix:
        branch = "Constant Spring Road"
    if "spa" in prefix:
        branch = "Spanish Town"
def GetClosesWords(self, s1):
    closest_words = self.w_2_vec_util.GetClosestWords(s1.lower())
    return closest_words
Example #25
    def main():
        # game starts here
        players = []
        # max_nb_cards = 2 # also the limit to regular move testing
        max_nb_cards = 12  # also the limit to regular move
        game = Game(False, max_nb_cards, players)
        objective = ''
        # auto gen for testing
        # game.__players.append(Player('kenlo', 1, game.__nb_cards))
        # game.__players[0].setObjective('colors')
        # game.__players.append(Player('bot', 2, game.__nb_cards))
        # game.__players[1].setObjective('dots')

        alpha_beta = None
        reply = ''
        while reply not in ('y', 'n'):
            msg = 'Activate Alpha-Beta ? (Y/N) '
            usr_input = input(msg)
            reply = usr_input.lower()
            if reply == 'y':
                alpha_beta = True
                break
            if reply == 'n':
                alpha_beta = False
                break

        trace = None
        reply = ''
        while reply not in ('y', 'n'):
            msg = 'Generate an Output Trace ? (Y/N) '
            usr_input = input(msg)
            reply = usr_input.lower()
            if reply == 'y':
                trace = True
                break
            if reply == 'n':
                trace = False
                break

        if trace:
            trace_file = game.createOutputFile(alpha_beta)
            trace_file.close()
        else:
            trace_file = None

        ai_turn = 0
        while reply not in ('1', '2'):
            msg = 'The A.I will be Player 1 or 2 ? (1/2) '
            usr_input = input(msg)
            reply = usr_input
            if (reply == '1' or reply == '2'):
                ai_turn = int(reply)
                break

        for i in range(1, 3):

            if ai_turn == i:
                # bot = AIPlayer(2, 'Bot', i, max_nb_cards, 'ai')
                # bot = AIPlayer(2, 'Bot', i, max_nb_cards, 'ai') #todo: 2 LEVEL DEEP
                bot = AIPlayer(1, 'Bot', i, max_nb_cards,
                               'ai')  #todo: 1 LEVEL DEEP
                # print('bot name = ', bot.name())
                players.append(bot)

            else:
                msg = 'Player ' + str(i) + ' name: '
                usr_input = input(msg)
                name = str(usr_input)
                player = Player(name, i, max_nb_cards, 'human')
                players.append(player)

            if i == 1:

                # if ai_turn == i:
                #    random_int = randint(0,20)
                #    if random_int % 2 == 0:
                #        players[i - 1].setObjective('colors')
                #        print('Bot chose colors as objective.')
                #    else:
                #        players[i - 1].setObjective('dots')
                #        print('Bot chose dots as objective.')
                #
                # else:
                while objective not in ('colors', 'dots'):
                    msg = 'Player ' + str(i) + ' objective (colors or dots): '
                    usr_input = input(msg)
                    objective = usr_input.lower()
                    if objective == 'colors' or objective == 'dots':
                        players[i - 1].setObjective(objective)
                        break

            else:
                if (players[i - 2].objective() == 'colors'
                    ):  #player 1 at position 0 in list
                    players[i - 1].setObjective(
                        'dots')  #player 2 at position 1 in list
                elif (players[i - 2].objective() == 'dots'):
                    players[i - 1].setObjective('colors')

            # print(players)

        # new blank board
        brd_1 = Board(8, 12)
        brd_1.setBoard()
        brd_1.printBoard()

        # cards history
        played_cards = []
        # print(game.__hasWinner)
        turn_count = 0
        # end_game = 30
        end_game = 20
        # regular turns loop
        while not game.__hasWinner:
            turn_count += 1
            print('Round #', turn_count, sep='')
            print('=' * 36)
            print('')
            turnP1 = Turn(alpha_beta, trace, turn_count, max_nb_cards,
                          end_game, brd_1, players[0], played_cards,
                          trace_file)
            turnResult = turnP1.start()
            game.checkResult(turnResult)
            if game.__hasWinner:
                break
            turnP2 = Turn(alpha_beta, trace, turn_count, max_nb_cards,
                          end_game, brd_1, players[1], played_cards,
                          trace_file)
            turnResult = turnP2.start()
            game.checkResult(turnResult)
            if game.__hasWinner:
                break
            if turn_count == end_game:
                print('Game Over. It ended in a draw.')
                break
            print('End of Round #', turn_count, sep='')
            print('=' * 36)
            print('')
Example #26
    points.append(left)
    points.append(right)


#Checks if the program is closed out of.
def isQuit():
    for event in pygame.event.get():
        if event.type == pygame.QUIT:
            pygame.quit()
            sys.exit()


#Prompts the user if they want to see each segment of the snowflake be drawn or just each iteration be drawn.
print("\n")
st = input("slow draw? (Y/N): ")
slowDraw = (st.lower() == "y")

#Main Loop
while True:
    isQuit()
    num = 0
    resetPoints()
    while num < 6:
        isQuit()
        display_surf.fill(BLACK)
        if slowDraw:
            for i in range(len(points)):
                isQuit()
                if i == len(points) - 1:
                    next = points[0]
                else:
Example #27
def sentiment_model():
    print("Predicting Sentiments")
    imdb_df_pd = session.execute('SELECT *  FROM movie_sent')

    #imdb_df_pd = pd.read_csv("IMDB_Dataset.csv")

    for col in imdb_df_pd.columns:
        if imdb_df_pd[col].dtypes == 'object':
            imdb_df_pd[col] = imdb_df_pd[col].astype('str')

        #imdb_df_pd.head(10)
    imdb_df = sqlContext.createDataFrame(imdb_df_pd)

    # Print the schema in a tree format
    #imdb_df.printSchema()
    #Categorize  sentiment to 0 or 1
    indexer = StringIndexer(inputCol="sentiment", outputCol="score")
    imdb_df = indexer.fit(imdb_df).transform(imdb_df)
    imdb_df = imdb_df.drop("sentiment")
    #imdb_df.show()

    imdb_df = imdb_df.select(
        regexp_replace('review', r'[!?.;:#-/<>]+', ' ').alias('review'),
        'score')
    imdb_df = imdb_df.select(
        regexp_replace('review', r'\"', ' ').alias('review'), 'score')
    imdb_df = imdb_df.select(
        regexp_replace('review', ',', ' ').alias('review'), 'score')
    imdb_df = imdb_df.select(
        regexp_replace('review', r"('s\s+)", ' ').alias('review'), 'score')
    imdb_df = imdb_df.select(
        regexp_replace('review', r"('\s+)", ' ').alias('review'), 'score')
    #imdb_df.show()

    # remove all single characters
    imdb_df = imdb_df.select(
        regexp_replace('review', r'\s+[a-zA-Z]\s+', ' ').alias('review'),
        'score')

    # remove single characters from the start
    imdb_df = imdb_df.select(
        regexp_replace('review', r'^[a-zA-Z]\s+', '').alias('review'), 'score')

    # remove digits
    imdb_df = imdb_df.select(
        regexp_replace('review', '[0-9]+', ' ').alias('review'), 'score')

    # Substitute multiple spaces with a single space
    imdb_df = imdb_df.select(
        regexp_replace('review', r'\s+', ' ').alias('review'), 'score')

    # Converting to Lowercase
    imdb_df = imdb_df.select(lower(imdb_df.review).alias('review'), 'score')

    #imdb_df.show()

    # Tokenize text

    tokenizer = Tokenizer(inputCol='review', outputCol='words_token')

    df = tokenizer.transform(imdb_df)
    # Remove stop words
    remover = StopWordsRemover(inputCol='words_token',
                               outputCol='words_clean',
                               caseSensitive=False)

    # df = remover.transform(df)

    # df.show(10)
    cv = CountVectorizer(inputCol="words_clean",
                         outputCol="tf",
                         vocabSize=2**17,
                         minDF=5.0)

    # we now create a pipelined transformer
    cv_pipeline = Pipeline(stages=[tokenizer, remover, cv]).fit(imdb_df)

    #cv_pipeline.transform(imdb_df).show(5)

    idf = IDF(inputCol="tf", outputCol="idf")

    idf_pipeline = Pipeline(stages=[cv_pipeline, idf]).fit(imdb_df)
    #idf_pipeline.transform(imdb_df).show(5)

    training_df, validation_df, testing_df = imdb_df.randomSplit(
        [0.6, 0.3, 0.1], seed=0)

    #print(training_df.count(), validation_df.count(), testing_df.count())

    lr = LogisticRegression(maxIter=50,
                            regParam=0.0,
                            elasticNetParam=0.0,
                            featuresCol="idf",
                            labelCol="score")
    lr_pipeline = Pipeline(stages=[idf_pipeline, lr]).fit(training_df)
    print("Prediction Accuracy before Tuning")
    lr_pipeline.transform(validation_df). \
        select(expr('float(prediction = score)').alias('correct')). \
        select(avg('correct').alias('Accuracy')).show()

    # identify noise in the model
    vocabulary = cv_pipeline.stages[2].vocabulary
    # vocabulary = idf_pipeline.stages[0].stages[2].vocabulary
    #print(vocabulary)
    weights = lr_pipeline.stages[1].coefficients.toArray()
    #print(weights)
    coeffs_df = pd.DataFrame({'word': vocabulary, 'weight': weights})
    #print(coeffs_df.sort_values('weight').head(5))
    #print(coeffs_df.sort_values('weight', ascending=False).head(5))
    # Fit the model

    # The data is overfitted:
    # modify the loss function and penalize weight values that are too large,
    # using either L1 (Lasso) or L2 (Ridge) regularization.
    from pyspark.ml.tuning import ParamGridBuilder

    # Evaluate the model and find the best-fit model (done separately).

    # Best parameters are regParam = 0.01 and elasticNetParam = 0.2.

    lr = LogisticRegression(maxIter=50,
                            regParam=0.01,
                            elasticNetParam=0.2,
                            featuresCol="idf",
                            labelCol="score")
    lr_pipeline_fitted = Pipeline(stages=[idf_pipeline, lr]).fit(training_df)

    print("Prediction Accuracy - After Tuning")
    lr_pipeline_fitted.transform(validation_df). \
        select(expr('float(prediction = score)').alias('correct')). \
        select(avg('correct').alias('accuracy')).show()

    # identify noise in the model

    vocabulary = cv_pipeline.stages[2].vocabulary
    #print(vocabulary)
    weights = lr_pipeline_fitted.stages[1].coefficients.toArray()
    #print(weights)
    coeffs_df = pd.DataFrame({'word': vocabulary, 'weight': weights})
    #print(coeffs_df.sort_values('weight').head(5))
    #print(coeffs_df.sort_values('weight', ascending=False).head(5))
    print("end of sentiment model")
    return lr_pipeline_fitted
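The comments above refer to a tuning run done separately; a hedged sketch of what that grid search might look like (an editorial addition, not part of the original script; it reuses the LogisticRegression and Pipeline imports already in scope):

from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import BinaryClassificationEvaluator

def tune_lr(idf_pipeline, training_df):
    # Grid over the two regularization knobs discussed above.
    lr = LogisticRegression(maxIter=50, featuresCol="idf", labelCol="score")
    grid = (ParamGridBuilder()
            .addGrid(lr.regParam, [0.0, 0.01, 0.1])
            .addGrid(lr.elasticNetParam, [0.0, 0.2, 0.5])
            .build())
    cv = CrossValidator(estimator=Pipeline(stages=[idf_pipeline, lr]),
                        estimatorParamMaps=grid,
                        evaluator=BinaryClassificationEvaluator(
                            labelCol="score", metricName="areaUnderROC"),
                        numFolds=3)
    return cv.fit(training_df)  # returns a CrossValidatorModel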
def CleanString(string):
    new_string = ''.join(e for e in string if e.isalnum())
    return new_string.lower()
Example #29
    def where_is(self, patterns, df=None, union=True, columns=None,
                 exact=False, case_sensitive=False):
        """Find a list of string patterns in a DataFrame.

        Parameters
        ----------
        patterns : list
            List of string patterns to search.
        df : pd.DataFrame | None
            The DataFrame to use. If None, the DataFrame of the ROI is used
            by default.
        union : bool | True
            Take either the union of matching patterns (True) or the
            intersection (False).
        columns : list | None
            List of specific column names to search in. If None, this method
            searches through the entire DataFrame.
        exact : bool | False
            Specify whether the pattern has to match exactly (True) or may
            be only a part of the result.
        case_sensitive : bool | False
            Specify whether the search has to be case sensitive.

        Returns
        -------
        idx : list
            List of indices that match the list of patterns.
        """
        # Check inputs :
        assert isinstance(patterns, (str, list, tuple))
        df_to_use = self.ref if df is None else df
        is_pandas_installed(raise_error=True)
        import pandas as pd
        assert isinstance(df_to_use, pd.DataFrame)
        patterns = [patterns] if isinstance(patterns, str) else patterns
        patterns = list(patterns)
        if columns is not None:
            df_to_use = df_to_use[columns]
        dfarr = np.array(df_to_use).astype(str)
        # Case sensitive :
        if not case_sensitive:
            dfarr = npchar.lower(dfarr)
            patterns = npchar.lower(np.array(patterns).astype(str))
        # Define the matching function :
        if exact:
            def match(x, pat): return np.any(x == pat, axis=1)  # noqa
        else:
            def match(x, pat):
                return np.any((npchar.find(x, pat) + 1).astype(bool), axis=1)
        # Locate patterns :
        idx_to_keep = np.zeros((dfarr.shape[0], len(patterns)), dtype=bool)
        for k, p in enumerate(patterns):
            idx_to_keep[:, k] = match(dfarr, str(p))
        # Return either the union or intersection across research :
        fcn = np.any if union else np.all
        idx_to_keep = fcn(idx_to_keep, 1)
        if not np.any(idx_to_keep):
            logger.error("No corresponding entries in the %s ROI for "
                         "%s" % (self.name, ', '.join(patterns)))
            return []
        else:
            idx_roi = np.array(df_to_use['index'].loc[idx_to_keep]).astype(int)
            return idx_roi.tolist()
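Usage sketch for where_is, assuming `roi` is an instance of the class defining it; note the method reads df_to_use['index'] at the end, so 'index' must survive the columns selection:

import pandas as pd

df = pd.DataFrame({'index': [0, 1, 2],
                   'label': ['Insula L', 'Precuneus R', 'insula R']})
idx = roi.where_is('insula', df=df, columns=['index', 'label'])
print(idx)  # expected: [0, 2]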
def compareMethods(waveR, waveC, parametersR, parametersC, regionR, regionC):
    # FUNCTION PURPOSE: Get user input to compare results from two methods based on their hodographs
    #
    # INPUTS:
    #   wave: Dictionary containing wavelet transformed surfaces, for rectangle (R) and contour (C) methods
    #   parameters: Dictionary containing wave parameters, for R and C methods
    #   region: Boolean mask tracing the wave on the power surface, for R and C methods
    #
    # OUTPUTS:
    #   parameters: Dictionary containing wave parameters, for the chosen method
    #   region: Boolean mask tracing the wave on the power surface, for the chosen method


    # First, filter based on half-max wind variance, from Murphy (2014)

    # Calculate the wind variance of the wave
    windVarianceR = np.abs(waveR.get('uTrim')) ** 2 + np.abs(waveR.get('vTrim')) ** 2
    windVarianceC = np.abs(waveC.get('uTrim')) ** 2 + np.abs(waveC.get('vTrim')) ** 2

    # Get rid of values below half-power, per Murphy (2014)
    uR = waveR.get('uTrim').copy()[windVarianceR >= 0.5 * np.max(windVarianceR)]
    vR = waveR.get('vTrim').copy()[windVarianceR >= 0.5 * np.max(windVarianceR)]
    uC = waveC.get('uTrim').copy()[windVarianceC >= 0.5 * np.max(windVarianceC)]
    vC = waveC.get('vTrim').copy()[windVarianceC >= 0.5 * np.max(windVarianceC)]

    # Discard imaginary components, which aren't needed for hodograph
    uR = uR.real
    vR = vR.real
    uC = uC.real
    vC = vC.real

    # Now, create hodograph subplots for easy comparison
    fig, ax = plt.subplots(1, 2)
    fig.suptitle('Which Hodograph Looks Better?')
    ax[0].plot(uR, vR)
    ax[0].set_title('Rectangle Peak Trace Method')
    ax[1].plot(uC, vC)
    ax[1].set_title('Contour Peak Trace Method')
    plt.show()

    # Get user input for selection
    print("\r\nPlease enter the name of the method that showed a more elliptical shape:")

    # userInput starts as empty string
    userInput = ""

    # While userInput remains empty, get input
    while not userInput:
        userInput = input().lower()

        # If input isn't either "rectangle" or "contour", set userInput to empty string
        if userInput != "rectangle" and userInput != "contour" and userInput != "r" and userInput != "c":
            # Console output to let user know requirements if they don't answer right
            print("Please enter either 'rectangle' or 'contour':")
            userInput = ""

    # Now that the loop has finished, return correct parameters and region
    if userInput == "rectangle" or userInput == "r":
        return parametersR, regionR
    else:
        return parametersC, regionC
def evaluate(base_data_path, movement_data_path, settings, real_change, CALCULATE_ERROR):
    list_index = []
    list_R = []
    list_t = []
    list_inliers = []

    movement_type = base_data_path.split("/")[3].lower()

    print(base_data_path)
    print(movement_data_path)

    for image_i in range(1, 100):
        print(image_i)

        #
        # Predict Pose Change
        #
        try:
            _, final_r, final_t, inliers, _ = predict_pose_change(base_data_path.format(image_i), movement_data_path.format(image_i), settings, real_change, CALCULATE_ERROR=False, print_image=True)

            list_index.append(image_i)
            list_R.append(final_r)
            list_t.append(final_t)
            list_inliers.append(inliers)
        except Exception as e:
            print("an exception occurred, skipping:", e)

    R_bar = np.array(list_R)
    t_bar = np.array(list_t)

    XYZ_mean = t_bar.mean(axis=0)
    XYZ_std = t_bar.std(axis=0)
    RPY_mean = R_bar.mean(axis=0)
    RPY_std = R_bar.std(axis=0)

    #
    # Plot CDF
    #
    # if movement_type == 'translation':
    plt.figure(figsize=[7, 7])
    plt.subplot(2,1,1)
    plot_cdf((t_bar), "Translation", movement_type, real_change, xyz=True)
    # else:
    plt.subplot(2,1,2)
    plot_cdf((R_bar), "Rotation", movement_type, real_change, rpy=True)
    plt.show()

    #
    # Plot all predictions per Rotation and Translation
    #
    plt.figure(figsize=(12, 8))
    plt.subplot(1, 2, 1)
    plot_means(R_bar, RPY_mean, RPY_std, "Rotation")
    plt.xlabel("image-pair index")
    plt.ylabel("prediction")
    plt.subplot(1, 2, 2)
    plot_means(t_bar, XYZ_mean, XYZ_std, "Translation")
    plt.ylabel("prediction")
    plt.xlabel("image-pair index")
    plt.suptitle("{}\n{}".format(base_data_path, movement_data_path))
    plt.show()

    plt.plot((t_bar))
    for i in range(len(list_index)):
        print()
        plt.text(i, (t_bar)[i][0], "#" + str(list_index[i]))
    plt.legend(["X", "Y", "Z"])
    plt.ylabel("prediction")
    plt.show()

    plt.plot((R_bar))
    for i in range(len(list_index)):
        print()
        plt.text(i, (R_bar)[i][0], "#" + str(list_index[i]))
    plt.legend(["φ", "θ", "ψ", ])
    plt.ylabel("prediction")
    plt.show()

    #
    # Plot number of inliers used.
    #
    print()
    print()
    X = [sum(x) for x in list_inliers]
    X_indices = np.argsort(X)
    X = np.array(X)[X_indices]
    M = np.array(list_index)[X_indices]
    plt.plot(range(len(M)), X, 'ro')
    plt.plot(range(len(M)), X, 'k.')
    for i in range(len(M)):
        print("list_index", M)
        print(i)
        plt.text(i, X[i] + 0.25, str(M[i]))
    plt.xlabel("image-pair index")
    plt.ylabel("Number of inliers")
    plt.title("number of inliers (sorted)")
    plt.show()
    print("X_indices", X_indices)
    print("list_index", M)


    return [
        XYZ_mean[0], XYZ_std[0],  # X
        XYZ_mean[1], XYZ_std[1],  # Y
        XYZ_mean[2], XYZ_std[2],  # Z
        RPY_mean[0], RPY_std[0],  # φ
        RPY_mean[1], RPY_std[1],  # θ
        RPY_mean[2], RPY_std[2],  # ψ
    ]
Example #32
def run():
    # Create a SmmryAPI call and pass in SMMRY API Key
    smmry = SmmryAPI(SMMRY_API_KEY)

    # text wrapper
    wrapper = TextWrapper(width=200,
                          initial_indent="          ",
                          subsequent_indent="             ")

    while True:
        os.system('clear')
        printBanner()

        query = input(BOLD + 'Enter search query: ' + RESET)
        connection = urllib.request.urlopen(
            'http://localhost:8983/solr/csce470/select?q=' +
            urllib.parse.quote_plus(query) + '&rows=100000')
        response = json.load(connection)
        numFound = response['response']['numFound']

        # print query data
        print('   + query time: ' + YELLOW +
              str(response['responseHeader']['QTime']) + ' ms' + RESET)
        print('   + documents found: ' + YELLOW + str(numFound) + RESET)

        index = 0
        while True:
            # 10 search results
            end = 10
            if (index + 10) >= numFound:
                end = 10 - ((index + 10) - numFound)
            print('   + search results (showing ' + str(index + 1) + '-' +
                  str(index + end) + ' out of ' + str(numFound) + '):')
            for x in range(index, index + end):
                try:
                    print('{0: <223}'.format(
                        (BG_WHITE + BLACK + '     [' + str(x + 1) + ']: ' +
                         RESET + BG_WHITE + BLACK +
                         str(response['response']['docs'][x]['title']))) +
                          RESET)
                    if not DONT_CALL_API:
                        article = smmry.summarize(str(
                            response['response']['docs'][x]['url']),
                                                  sm_length=SUMMARY_LEN)
                        print(
                            wrapper.fill(('Summary: ' + GREEN +
                                          str(article.sm_api_content))) +
                            RESET)
                    else:
                        print('Summary: [skipped - DONT_CALL_API is True]')
                except SmmryAPIException as e:
                    #print('            error ' + str(e))
                    print(wrapper.fill(RED + '[Could not summarize]' + RESET))
                except Exception as e:
                    print(wrapper.fill(RED + '[Could not summarize]' + RESET))
                finally:
                    print(
                        wrapper.fill(
                            'URL: ' + UNDERLINE + BLUE +
                            str(response['response']['docs'][x]['url']) +
                            RESET))
            # next steps loop
            answer = ''
            while True:
                answer = input(BOLD + 'View more? (prev/next/new): ' + RESET).lower()
                if answer == 'prev':
                    if (index - 10) < 0:
                        print('Can\'t go back anymore. Try again.')
                    else:
                        index = index - 10
                        break
                elif answer == 'next':
                    if (index + 10) >= numFound:
                        print('Can\'t go forward anymore. Try again.')
                    else:
                        index = index + 10
                        break
                elif answer == 'new':
                    break
                else:
                    print('Invalid command. Try again.')
            # start new query
            if answer == 'new':
                break
            else:
                os.system('clear')
                printBanner()
                print(BOLD + 'Enter search query: ' + RESET + query)

        print()