def index(request):
    if request.method == 'POST' and 'csvFile' in request.FILES:
        csv_file = request.FILES['csvFile']
        parsed_csv = parse_csv(csv_file)
        if parsed_csv == 'Not a CSV':
            return JsonResponse({"notCsv": "true"})
        header = parsed_csv.pop(0)
        response_data = create_response(parsed_csv, header, False)
        context = {"html": response_data['html'], "data": response_data["unique_mpn"]}
        return JsonResponse(context)
    elif request.POST.getlist('rawHTML'):
        html = request.POST.getlist("rawHTML")[0]
        parsed_html = parse_html(html)
        head = [value.get_text() for value in parsed_html['header']]
        table = get_html_text(parsed_html['rows'])
        if request.POST.getlist('sort')[0] == "false":
            file_name = request.POST.getlist("filename")[0]
            for row in table[2:-1]:
                Build_Data.objects.create(
                    file_name=file_name, designator=row[0], footprint=row[1],
                    mid_x=row[2], mid_y=row[3], ref_x=row[4], ref_y=row[5],
                    pad_y=row[6], pad_x=row[7], layer=row[8],
                    rotation=row[9], comment=row[10])
            return HttpResponse("")
        else:
            response_data = create_response(table, head, True)
            context = {"html": response_data['html'], "data": response_data["unique_mpn"]}
            return JsonResponse(context)
    return render(request, 'home.html')
Example 2
def main():
    args = get_args()
    utils.configure_logging(verbose=args.verbose,
                            debug=args.debug,
                            error=args.error)
    session = http_session.FingertipsSession()

    # Get data
    if args.indicator_id:
        lines = objects.Data.by_indicator_id(
            session,
            indicator_ids={args.indicator_id},
            child_area_type_id=args.area_type_id,
            parent_area_type_id=args.parent_area_type_id)

    elif args.profile_id:
        lines = objects.Data.by_profile_id(
            session,
            child_area_type_id=args.area_type_id,
            parent_area_type_id=args.parent_area_type_id,
            profile_id=args.profile_id)
    else:
        raise argparse.ArgumentError(
            None, 'Either indicator_id or profile_id is required')

    rows = utils.parse_csv(lines)
    # Filter
    rows = (row for row in rows if row_filter(row, args=args))

    # Write to file (or to screen)
    if args.output:
        with args.output.open('w', newline='\n') as buffer:
            utils.write_csv(rows, buffer=buffer, write_header=args.write_header)
    else:
        utils.write_csv(rows, buffer=sys.stdout, write_header=args.write_header)
Example 3
def show_report(report_id):
    custom_report = CustomReport.query.get(report_id)

    if request.method == 'POST':
        session['username'] = request.form['username']
        session['password'] = request.form['password']

    username = session.get('username', None)
    password = session.get('password', None)

    if username and password:
        csv_document = utils.get_csv_from_url(custom_report.url,
                                              username=username,
                                              password=password)

        if '<!DOCTYPE html PUBLIC' in csv_document:
            session['username'] = None
            session['password'] = None
            return render_template('reports/report.html',
                                   require_auth=True,
                                   failure=True)

        reports = utils.parse_csv(csv_document)
        return render_template('reports/report.html', reports=reports)
    else:
        return render_template('reports/report.html', require_auth=True)
Example 4
def main(file_path, logger_p):

    # read input file - returns list of lists
    data_records = parse_csv(file_path,
                             smoothing_level=0,
                             should_shuffle=False)
    logger_p.log(
        '{num_records} records loaded'.format(num_records=len(data_records)))

    # # run offline predictors
    # offline_predictor_errors = \
    #     run_offline_predictors(
    #         logger_p,
    #         data_records,
    #         [
    #             (OfflineAutoRegressionHandler, 1),
    #             # (MovingAverageHandler, 1),
    #         ]
    #     )

    # run online predictor
    predictor = OnlineAutoRegressionHandler(logger_p, p=1, lag_size=13)
    online_predictor_errors = \
        run_online_predictor(logger_p, data_records, predictor=predictor)

    # log results
    # utils.log_metrics_dict(logger_p, offline_predictor_errors)
    utils.log_metrics_dict(logger_p, online_predictor_errors)
Example 5
def main():
    movies = parse_csv()
    with open("ebert.csv", 'w') as out:
        writer = csv.writer(out)
        writer.writerow(EBERT_FIELDS)
        for title, year in movies:
            result = get_ebert_ratings(title, year)
            writer.writerow([result[f] for f in EBERT_FIELDS])
Example 7
def calc_rate(logger):
    logger.log('load records..')
    data_records = utils.parse_csv(DATASET_FILE_PATH,
                                   smoothing_level=1,
                                   should_shuffle=False)
    logger.log('calculate rates..')
    logger.log(utils.calc_mid_end_rate(data_records))
    logger.log('done.')
Example 8
def main():
    movies = parse_csv()
    with open("metacritic.csv", 'w') as out:
        writer = csv.writer(out)
        writer.writerow(METACRITIC_FIELDS)
        for title, year in movies:
            result = get_metacritic_ratings(title, year)
            writer.writerow([result[f].encode("utf-8") for f in METACRITIC_FIELDS])
Example 9
    def __init__(self,
                 examples=None,
                 attrs=None,
                 attr_names=None,
                 target=-1,
                 inputs=None,
                 values=None,
                 distance=mean_boolean_error,
                 name='',
                 source='',
                 exclude=()):
        """Accepts any of DataSet's fields. Examples can also be a
        string or file from which to parse examples using parse_csv.
        Optional parameter: exclude, as documented in .set_problem().
        >>> DataSet(examples='1, 2, 3')
        <DataSet(): 1 examples, 3 attributes>"""
        self.name = name
        self.source = source
        self.values = values
        self.distance = distance
        self.got_values_flag = bool(values)

        # initialize .examples from string, file, or list
        if isinstance(examples, str):
            self.examples = parse_csv(examples)
        else:
            try:
                self.examples = parse_csv(
                    open('datasets/' + name + '.csv').read())
            except FileNotFoundError:
                self.examples = examples

        # attrs are the indices of examples, unless otherwise stated.
        if self.examples and not attrs:
            attrs = list(range(len(self.examples[0])))

        self.attrs = attrs

        # initialize .attr_names from string, file, or by default
        if isinstance(attr_names, str):
            self.attr_names = attr_names.split()
        else:
            self.attr_names = attr_names or attrs

        self.set_problem(target, inputs=inputs, exclude=exclude)
Example 11
def main():
    movies = parse_csv()
    with open("rottentomatoes.csv", 'w') as out:
        writer = csv.writer(out)
        writer.writerow(RT_FIELDS)
        for title, year in movies:
            result = get_rottentomatoes_ratings(title, year)
            writer.writerow([result[f] for f in RT_FIELDS])
            time.sleep(1)
Example 13
def main():
    movies = parse_csv()
    with open('box_office_mojo.csv', 'w') as out:
        writer = csv.writer(out)
        writer.writerow(result_fields)
        for m in movies:
            title, year = m
            html = fetch_site(title, year)
            result = process_site(html, title, year)
            writer.writerow([result[f] for f in result_fields])
Example 15
def main(args):
    if args.load_from is not None:
        parsed_articles = parse_csv(args.load_from)
        for parsed in parsed_articles:
            print "Storing parsed article %s" % parsed.article_id
            store(parsed)

    if args.build_model is not None:
        data = database.session.query(
            Article.article_id, Article.body).filter(Article.body != '').all()
        sim_model = SimilarityModel(redis_db, data, args.build_model)
        sim_model.save()
Example 16
def collect_data(input_file="script.csv", output_file="movies.csv"):
    movies = parse_csv(input_file)
    with open(output_file, "w") as out:
        writer = csv.writer(out)
        writer.writerow(FIELDS)
        for title, year in movies:
            result = defaultdict(lambda: "N/A", {'title':title, 'year': year})
            result = get_box_office_mojo_results(title, year, result)
            result = get_rottentomatoes_ratings(title, year, result)
            result = get_ebert_ratings(title, year, result)
            result = get_metacritic_ratings(title, year, result)
            writer.writerow([unicode(result[f]).encode("utf-8") for f in FIELDS])
Example 17
def process_file(file_path):
    utils.file_exists(file_path)
    filename, mod_time = utils.get_file_metadata(file_path)
    file_hash = utils.get_file_hash(filename, mod_time)
    if data.logic.file_already_proccessed(file_hash):
        log.info('The file {} has already been processed'.format(filename))
    else:
        log.info('Start processing {}'.format(filename))
        data.logic.create_file_processed(file_hash, filename, mod_time)
        input_records = utils.parse_csv(file_path)
        records_with_candidates = data.logic.infer_candidates(input_records)
        process_rows(file_hash, records_with_candidates)
        log.info('Finished processing {}'.format(filename))
Example 19
def algorithm_quality():
    sample = parse_csv('samples/smaller_data.csv')

    # Step 0
    training_data_set, testing_data_set = data_set_division(sample)

    #
    # # Step 1
    # similarity_vectors = calc_similarity_vectors(tmp_val, training_data_set)
    #
    # # Step 2
    # category_similarity = calc_category_similarity(similarity_vectors)
    #
    # # Step 3
    # decision = calc_decision(category_similarity)

    # Step 4
    m0 = 0

    loading_total = len(testing_data_set)
    loading_current = 0

    # Step 4.1
    for category, passw_arr in testing_data_set.items():
        for x in passw_arr:
            # a)
            similarity_vectors = calc_similarity_vectors(x, training_data_set)

            # b)
            category_similarity = calc_category_similarity(similarity_vectors)

            # c)
            decision = calc_decision(category_similarity)

            # d)
            if decision == category:
                m0 += 1

        loading_current += 1
        print(loading_current / loading_total)

    # Step 4.2: quality = fraction of correctly classified test passwords
    quality = m0 / sum(len(passw_arr) for passw_arr in testing_data_set.values())

    return f'Quality: {quality}'
Example 20
def main():
    # The slope defines how many iterations the program has to go through before adding a new frame to the graph
    slope = 1
    # Does this need any explanation?
    max_iterations = 1000
    # Learning ratio
    ratio = 0.1

    # Parsing CSV to build the data
    tmp_tab_x, tmp_tab_y = utils.parse_csv('data.csv', 'km', 'price')
    if (tmp_tab_x is None or tmp_tab_y is None):
        return

    # Creating new dataset for cost function test
    data = train_model.Dataset(tmp_tab_x, tmp_tab_y)
    plot_cost_function(data, max_iterations, ratio)

    # Creating new dataset for linear regression test
    data = train_model.Dataset(tmp_tab_x, tmp_tab_y)
    plot_linear_regression(data, slope, max_iterations, ratio)
Example 21
def analyse(directory, survey_file, config_file):
    """Analyses survey responses

    Args:
        directory(str): path to the folder containing the excel file and config file
            For example: "./static/uploads/4SikvVjjqlWV44AW/"
        survey_file(str): name of survey file (excel/csv)
            For example: "responses.xlsx" or "responses.csv"
        config_file(str): name of config file
            For example: "config_file.txt"

    Returns:
        A dictionary mapping each survey question to the analysis of its
        responses.
    """
    if survey_file.endswith(".csv"):
        parsed_file = parse_csv(os.path.join(directory, survey_file))
    else:
        parsed_file = parse_excel(os.path.join(directory, survey_file))

    categorised_responses = categorise(
        parsed_file,
        parse_config(os.path.join(directory, config_file)),
    )
    analysis = {}
    analysed = None
    for qn, responses in categorised_responses.items():
        category = responses[0]
        list_of_responses = responses[1]
        if category == "numerical":
            analysed = ("numerical", numerical(list(map(int, list_of_responses))))
        elif category == "multicategorical":
            analysed = ("categorical", multi_categorical(list_of_responses))
        elif category == "categorical":
            analysed = ("categorical", categorical(list_of_responses))
        elif category == "openended":
            analysed = ("openended", openended(list_of_responses, directory))
        analysis[qn] = analysed
        analysed = None
    return analysis
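A minimal usage sketch for analyse, assuming the uploads layout described in its docstring; the directory and file names below are placeholders, not values from the original project:

# Hypothetical call; paths and file names are assumptions for illustration.
results = analyse("./static/uploads/example/", "responses.csv", "config_file.txt")
for question, analysed in results.items():
    if analysed:
        category, summary = analysed
        print(question, category, summary)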
Example 22
def preprocess_csv(csv_file_name,
                   processed_file_name,
                   service_port,
                   test_file=False):
    with open(processed_file_name, 'w') as save_to_file:
        csv_writer = csv.writer(save_to_file,
                                delimiter=',',
                                quotechar='"',
                                quoting=csv.QUOTE_MINIMAL)
        for num_tweets, row in enumerate(parse_csv(csv_file_name,
                                                   service_port)):
            sentiment = int(row.field[0])
            tweet_id = row.field[1][1:-1]
            text = row.field[2]
            processed_text = preprocess_tweet(text)
            if not test_file:
                csv_writer.writerow([tweet_id, sentiment, processed_text])
            else:
                csv_writer.writerow([tweet_id, processed_text])
    print(
        f'Saved [{1+num_tweets}] processed tweets to [{processed_file_name}]')
    return processed_file_name
Example 23
def classify(processed_csv, service_port, test_file=True, **params):
    positive_words = file_to_wordset(params.pop('positive_words'), service_port)
    negative_words = file_to_wordset(params.pop('negative_words'), service_port)
    predictions = []
    for row in parse_csv(processed_csv, service_port):
        tweet_id = row.field[1]
        tweet = row.field[2]
        if not test_file:
            label = row.field[0]
        pos_count, neg_count = 0, 0
        for word in tweet.split():
            if word in positive_words:
                pos_count += 1
            elif word in negative_words:
                neg_count += 1
        # print pos_count, neg_count
        prediction = 1 if pos_count >= neg_count else 0
        if test_file:
            predictions.append((tweet_id, prediction))
        else:
            predictions.append((tweet_id, int(label), prediction))
    return predictions
Example 24
def main():
    # Get dataset - Get min-max values - Normalize values - Sort tabs - Set both thetas to 0
    tmp_tab_x, tmp_tab_y = utils.parse_csv('data.csv', 'km', 'price')
    if (tmp_tab_x is None or tmp_tab_y is None):
        return
    data = Dataset(tmp_tab_x, tmp_tab_y)

    # Get number of trainings and ratio
    loops = utils.parse_input_int(message=utils.bcolors.YELLOW + "How many training iterations do you want the program to run:\n" + utils.bcolors.ENDC)
    if loops == -1:
        return
    ratio = utils.parse_input_float(message=utils.bcolors.YELLOW + "At what rate:\n" + utils.bcolors.ENDC)
    if ratio == -1:
        return

    # train model
    for _ in range(loops):
        data.norm_theta0, data.norm_theta1 = train_model(ratio, data.norm_theta0, data.norm_theta1, data.norm_tab_x, data.norm_tab_y)

    # de-normalize thetas and save them
    data.theta0 = utils.de_normalize(data.norm_theta0, data.max_y, data.min_y)
    data.theta1 = utils.de_normalize(data.norm_theta1, data.max_y, data.min_y) / (data.max_x - data.min_x)
    utils.save_thetas(data.theta0, data.theta1)
Example 25
def kmeans_run_all():
    pd.set_option('expand_frame_repr', True)
    pd.set_option('max_rows', 100)
    np.set_printoptions(precision=3, floatmode='fixed')
    for fn in c.ALL:
        k = c.ks[fn]
        t = 1
        df, class_id = parse_csv(fn)
        clusters, centroids = kmeans(df, k, t)
        results = evaluate_clusters(clusters, centroids, verbose=False)
        totals = results.sum()
        totals.name = c.TOTALS
        results = results.append(totals)
        sfn = strip_file_path(fn)
        print(f'\nSummary - {sfn}')
        print(results)
        for idx, (cluster, centroid) in enumerate(zip(clusters, centroids)):
            print(f'\nCluster {idx + 1}')
            print(f'Centroid: {centroid}')
            print(cluster)
        if 2 <= clusters[0].shape[1] <= 3:
            plot_clusters([df], np.array([df.mean().values]), f'kmeans {sfn}')
            plot_clusters(clusters, centroids, f'kmeans clustered {sfn}')
Example 26
def main():
    '''
    Driver program
    '''
    # Preprocess and prep data to be manipulated
    #preprocess()
    attr, table = utils.parse_csv("clean_data.csv")
    utils.convert_data_to_numeric(table)

    # Gather attribute indexes, attribute domains, and classifying attribute index
    attr_indexes = list(range(len(attr)))
    class_index = attr_indexes.pop(len(attr) - 1)
    attr_domains = utils.get_attr_domains(table, attr, attr_indexes)

    # Naive Bayes
    #naive_bayes(table, attr, attr_indexes, class_index)

    # Decision Trees

    # k-Means Clustering
    attr_indexes = list(range(len(attr)))
    attr_domains = utils.get_attr_domains(table, attr, attr_indexes)
    utils.randomize_data(table)
    clustering(table, attr, attr_indexes, attr_domains)
Example 27
def preprocess():
    '''
    KEEP ATTRIBUTES:
    'date_time_intake'
    'intake_type'
    'breed_intake'
    'color_intake'
    'date_time_outcome'
    'outcome_type'
    'outcome_age'
    'gender_intake'
    'fixed_outcome'
    'age_bucket'
    'retriever'
    'shepherd'
    'beagle'
    'terrier'
    'boxer'
    'poodle'
    'rottweiler'
    'dachshund'
    'chihuahua'
    'pitbull'
    'time_bucket'

    DELETE ATTRIBUTES:
    'animal_id'
    'name_intake'
    'date_time_intake'
    'found_location'
    'animal_type_intake'
    'intake_condition'
    'month_year_intake'
    'intake_sex'
    'age'
    'breed_intake'
    'color_intake'
    'name_outcome'
    'date_time_outcome'
    'month_year_outcome'
    'outcome_subtype'
    'outcome_sex'
    'outcome_age'
    'gender_outcome'
    'fixed_intake'
    'fixed_changed'
    'date_time_length'
    '''
    attr, table = utils.parse_csv("adoption_data.csv")

    # Preserve animal entries for dogs and classifying attribute entry
    animal_index = attr.index('animal_type_intake')
    class_index = attr.index('time_bucket')
    table = [
        row for row in table
        if row[animal_index] == 'Dog' and row[class_index] != ''
    ]

    # Remove duplicate entries and entries without a recorded gender
    # (build a new list rather than removing rows while iterating over table)
    animal_ids = set()
    animal_id_index = attr.index('animal_id')
    gender_index = attr.index('gender_intake')
    deduped = []
    for row in table:
        if row[animal_id_index] in animal_ids or row[gender_index] == '':
            continue
        animal_ids.add(row[animal_id_index])
        deduped.append(row)
    table = deduped

    dogs_data = copy.deepcopy(table)
    utils.write_csv('dogs_data.csv', attr, dogs_data)

    # Remove attributes not to be trained on from instances in the dataset
    remove_attr = [
        'animal_id', 'name_intake', 'date_time_intake', 'found_location',
        'animal_type_intake', 'month_year_intake', 'intake_sex', 'age',
        'breed_intake', 'color_intake', 'name_outcome', 'date_time_outcome',
        'month_year_outcome', 'outcome_subtype', 'outcome_sex', 'outcome_age',
        'gender_outcome', 'fixed_intake', 'fixed_changed', 'date_time_length'
    ]

    # Remove each attribute from all rows
    for col in remove_attr:
        index = attr.index(col)
        attr.pop(index)
        for row in table:
            row.pop(index)

    utils.write_csv('clean_data.csv', attr, table)
Example 28
def sigmoid_test(logger, is_online, should_plot, lag_size):
    logger.log('load records..')
    data_records = utils.parse_csv(DATASET_FILE_PATH,
                                   smoothing_level=1,
                                   should_shuffle=False)

    logger.log('get predictions')
    model_error_metrics = dict()
    for record_id in range(len(data_records)):
        record = data_records[record_id]

        if len(record) < INITIAL_HISTORY_SIZE + NUMBER_OF_PREDICTIONS_AHEAD:
            continue

        else:

            if (record_id % LOGGING_INTERVAL) == 0:
                logger.log('* record #{record_id}'.format(record_id=record_id))

            # split to train and test sets
            train_set = record[:INITIAL_HISTORY_SIZE]
            test_set = record[INITIAL_HISTORY_SIZE:INITIAL_HISTORY_SIZE +
                              NUMBER_OF_PREDICTIONS_AHEAD]

            # fit model and calculate predictions
            sigmoid_predictions = list()
            if is_online:
                mid_max_rate = SigmoidCurve.MID_MAX_RATE
                for i in range(NUMBER_OF_PREDICTIONS_AHEAD):
                    tmp_history = train_set + test_set[:i]
                    next_prediction = \
                        SigmoidCurve.fit_and_predict_recursive(
                            tmp_history[-lag_size:], 1, mid_max_rate=mid_max_rate
                        )[0]
                    sigmoid_predictions.append(next_prediction)
                    mid_max_rate *= 0.85
                    mid_max_rate = max(1.7, mid_max_rate)

            else:
                sigmoid_predictions = \
                    SigmoidCurve.fit_and_predict_recursive(
                        train_set[-lag_size:], NUMBER_OF_PREDICTIONS_AHEAD
                    )

            # plot predictions
            if should_plot:
                utils.plot_graph_and_prediction(
                    train_set + test_set, sigmoid_predictions,
                    INITIAL_HISTORY_SIZE + 1,
                    'sigmoid__{record_id}'.format(record_id=record_id))

            error_metrics = utils.get_all_metrics(test_set,
                                                  sigmoid_predictions)
            for metric_name in error_metrics.keys():
                if metric_name not in model_error_metrics.keys():
                    model_error_metrics[metric_name] = list()
                model_error_metrics[metric_name].append(
                    error_metrics[metric_name])

    # log metrics
    logger.log('-- avg. performance:')
    utils.log_metrics_dict(logger, model_error_metrics)
Example 29
#!/usr/bin/python
# Script to retrain the network

from utils import parse_csv, simple_generator, plot_history
from cnn_models import simple_model, nvidia_net
from keras.models import load_model


# New dataset path
dataset_path='additional_data/'
training_samples, validation_samples=parse_csv(dataset_path,val_split=0.2,correction=0.15)

train_generator=simple_generator(training_samples,batch_size=32)
validation_generator=simple_generator(validation_samples,batch_size=32)

# Load the pretrained model
pretrained_model="model2.h5"
model=load_model(pretrained_model)
print("Re-training with: {} samples".format(len(training_samples[0])))

# Train the model with the new dataset generator
hist_obj=model.fit_generator(train_generator, samples_per_epoch=len(training_samples[0]), 
	validation_data=validation_generator, nb_val_samples=len(validation_samples), nb_epoch=7)

retrained_model="model2_re.h5"
plot_name="retrainig.png"
model_arch="model_arch.png"

# Save retrained model
model.save(retrained_model)
# Plot history of training epochs
Example 30
if __name__ == '__main__':
    if len(sys.argv) != 3:
        print('Usage: python stats.py <preprocessed-CSV> <service-port>')
        exit()
    num_tweets, num_pos_tweets, num_neg_tweets = 0, 0, 0
    num_mentions, max_mentions = 0, 0
    num_emojis, num_pos_emojis, num_neg_emojis, max_emojis = 0, 0, 0, 0
    num_urls, max_urls = 0, 0
    num_words, num_unique_words, min_words, max_words = 0, 0, 1e6, 0
    num_bigrams, num_unique_bigrams = 0, 0
    all_words = []
    all_bigrams = []
    csv_path = sys.argv[1]
    service_port = int(sys.argv[2])
    for num_tweets, row in enumerate(parse_csv(csv_path, service_port)):
        t_id = row.field[0]
        if_pos = int(row.field[1])
        tweet = row.field[2]
        if if_pos:
            num_pos_tweets += 1
        else:
            num_neg_tweets += 1
        result, words, bigrams = analyze_tweet(tweet)
        num_mentions += result['MENTIONS']
        max_mentions = max(max_mentions, result['MENTIONS'])
        num_pos_emojis += result['POS_EMOS']
        num_neg_emojis += result['NEG_EMOS']
        max_emojis = max(max_emojis, result['POS_EMOS'] + result['NEG_EMOS'])
        num_urls += result['URLS']
        max_urls = max(max_urls, result['URLS'])
Example 31
                        metavar='Path',
                        help='Path to output scores')
    args = parser.parse_args()

    file_list = glob.glob(args.data_path + '*.csv')

    print(file_list)

    assert len(file_list) > 1, (
        'Not enough files found in the specified folder. '
        'At least two files with scores should be available in the folder.')

    score_files = []

    for score_file in file_list:
        score_files.append(parse_csv(score_file))

    out_data = [get_header(file_list[0])]

    classes = out_data[0][2:]

    idx_to_class = {}
    for i, clss in enumerate(classes):
        idx_to_class[str(i)] = clss

    with torch.no_grad():

        iterator = tqdm(score_files[0], total=len(score_files))
        for filename in iterator:

            out = 0.0
Example 32
def predict_using_online_mode(logger,
                              ar_order,
                              ma_order,
                              with_c=True,
                              initial_history_size=5,
                              number_of_predictions_ahead=10,
                              lag_size=0,
                              start_params=None):
    # read series data
    # read input file - returns list of lists
    data_records = utils.parse_csv(DATASET_FILE_PATH,
                                   smoothing_level=1,
                                   should_shuffle=False)
    logger.log(
        'records loaded: {num_records}'.format(num_records=len(data_records)))

    # define min error storage
    model_error_metrics = dict()
    valid_samples_counter = 0

    # make predictions for each record
    logger.log('** ARMA settings: p={p}, q={q}'.format(p=ar_order, q=ma_order))
    start_time = time.time()
    for record_index in range(len(data_records)):
        if (record_index % LOGGING_INTERVAL) == 0:
            logger.log(
                '-- record #{record_index}'.format(record_index=record_index))

        # test sample size and split to train and test sets
        current_sample = data_records[record_index]
        if len(current_sample) < initial_history_size + number_of_predictions_ahead:
            # logger.log('Not enough info in record. record size={record_size}'.format(record_size=len(current_sample)))
            continue

        train_set, test_set = \
            current_sample[:initial_history_size], \
            current_sample[initial_history_size:initial_history_size+number_of_predictions_ahead]

        # run ARMA model
        arma_model = ARMAModel(logger,
                               p=ar_order,
                               q=ma_order,
                               with_c=with_c,
                               lag_size=lag_size)

        predictions = '<not initialized>'
        try:
            arma_model.learn_model_params(train_set,
                                          start_params=np.array(start_params))

            if not IS_ONLINE:
                predictions = arma_model.predict_using_learned_params(
                    train_set, number_of_predictions_ahead)

            else:
                predictions = list()
                for i in range(number_of_predictions_ahead):
                    # predict next value
                    predicted_value = arma_model.predict_using_learned_params(
                        train_set, 1)

                    # store prediction
                    predictions.append(predicted_value[0])

                    # update model with test value
                    arma_model.update_model([test_set[i]])

            error_metrics = utils.get_all_metrics(test_set, predictions)

        except Exception as ex:
            if 'Not enough info' not in str(ex):
                logger.log(ex)
                # logger.log('series: {ser}'.format(ser=current_sample))
                # logger.log('predictions: {preds}'.format(preds=predictions))
            continue

        for metric_name in error_metrics.keys():
            if metric_name not in model_error_metrics.keys():
                model_error_metrics[metric_name] = list()
            model_error_metrics[metric_name].append(error_metrics[metric_name])
        valid_samples_counter += 1

    logger.log('total valid predictions: {valid_predictions}'.format(
        valid_predictions=valid_samples_counter))
    logger.log('total time: {total_secs} secs'.format(total_secs=time.time() -
                                                      start_time))

    return model_error_metrics
Example 33
                    help='Flag to print results in a parser friendly format')
parser.add_argument('--normalize',
                    action='store_true',
                    help='Flag to normalize input data')
parser.add_argument('--mode',
                    type=str,
                    default='matrix',
                    help='Following input modes: matrix, middle, aggregate')

args = parser.parse_args()

mode = args.mode

normalize = args.normalize
if args.train:
    trX, trY = parse_csv(args.train_file, num_hpc=12, normalize=normalize, mode=mode)

if args.testing:
    teX, teY = parse_csv(args.test_file, num_hpc=12, normalize=normalize, mode=mode)


# Network parameters
learning_rate = 0.001
reg_param = 0.01
dropout_prob = 0.5
training_epochs = 4
display_step = 1
std_pram = 1.0
num_input = len(trX[0][0]) if args.train else len(teX[0][0])
num_steps = len(trX[0]) if args.train else len(teX[0])
num_units = 15 if args.num_units is None else args.num_units
Example 34
def analysis_page():
    # Methods
    survey_file = request.args.get("survey_file")
    config = request.args.get("config")
    if (not request.method == "POST") and (not survey_file and not config):
        return redirect(url_for("main"))
    # Checking for files
    do_analysis = False
    if request.method == "GET":
        if survey_file and config:
            do_analysis = True
    elif request.method == "POST" and (request.files["file"]
                                       and request.files["config"]):
        do_analysis = True

    # Do analysis
    if do_analysis:
        # Saving files
        if request.method == "POST":
            save = save_file(survey_file=request.files["file"],
                             config_file=request.files["config"])
            directory, filename, config_filename = (
                save["Directory"],
                save["File"],
                save["Config"],
            )
        else:
            directory, filename = os.path.split(survey_file)
            config_filename = os.path.basename(config)

        if filename.endswith(".xlsx"):
            questions = list(
                utils.parse_excel(os.path.join(directory, filename)).keys())
        else:
            questions = list(
                utils.parse_csv(os.path.join(directory, filename)).keys())
        types = utils.parse_config(os.path.join(directory, config_filename))

        # Excel but incomplete config
        if len(questions) != len(types):
            session["TEMP_FOLDER"] = directory
            predictor = utils.Predictor()
            qn_dict = {}
            for i, qn in enumerate(questions):
                if i + 1 not in types.keys():
                    datatype = predictor.predict([qn])
                    qn_dict[i + 1] = (qn, datatype[0])
                else:
                    qn_dict[i + 1] = (qn, types[i + 1])
            questions_index = [(i[0], i[1][0], i[1][1])
                               for i in qn_dict.items()]

            return render_template("config.html",
                                   questions=questions_index,
                                   error=None)

        # Start analysis
        try:
            session["ANALYSIS"] = analyse.analyse(directory, filename,
                                                  config_filename)
        except ValueError:
            return render_template(
                "error.html",
                error="ValueError! Perhaps you chose a wrong category for your data",
                error_no="500",
                error_message=error_messages[500],
            )
        except Exception as e:
            return render_template(
                "error.html",
                error=f"Unknown error: {str(e)}",
                error_no="500",
                error_message=error_messages[500],
            )

        graphs, clouds, numerical = [], [], []
        for question, analysis in session["ANALYSIS"].items():
            if analysis:
                if analysis[0] == "categorical":
                    graphs.append([
                        question,
                        utils.pie(
                            question,
                            [x for x in analysis[1]["Percentages"].keys()],
                            [y for y in analysis[1]["Percentages"].values()],
                        ),
                    ])
                elif analysis[0] == "openended":
                    clouds.append([question, analysis[1]])
                elif analysis[0] == "numerical":
                    numerical.append([question, analysis[1]])

        graphs = tuple(utils.chunk(graphs, 3))
        clouds = tuple(utils.chunk(clouds, 2))
        numerical = tuple(utils.chunk(numerical, 4))

        return render_template(
            "analysis.html",
            graphs=graphs,
            clouds=clouds,
            numerical=numerical,
            filename=filename,
            path=os.path.split(directory)[1],
        )
    elif not request.files["file"]:  # No excel
        return render_template("upload.html", error="Missing Excel/CSV file!")

    elif request.files["file"] and not request.files["config"]:  # Excel but no config
        save = save_file(survey_file=request.files["file"])
        directory, filename = save["Directory"], save["File"]
        session["TEMP_FOLDER"] = directory
        if filename.endswith(".xlsx"):
            questions = list(
                utils.parse_excel(os.path.join(directory, filename)).keys())
        else:
            questions = list(
                utils.parse_csv(os.path.join(directory, filename)).keys())
        predictions = utils.Predictor().predict(questions)
        questions_index = [(i + 1, question, predictions[i])
                           for i, question in enumerate(questions)]

        return render_template("config.html",
                               questions=questions_index,
                               error=None)
Example 35
args = parser.parse_args()

mode = args.mode


# helper function
def debug(msg):
    if args.debug:
        print('DEBUG: {}'.format(msg))


normalize = args.normalize
if args.train:
    trX, trY = parse_csv(args.train_file,
                         num_hpc=args.hpc,
                         normalize=normalize,
                         mode=mode)

if args.testing:
    teX, teY = parse_csv(args.test_file,
                         num_hpc=args.hpc,
                         normalize=normalize,
                         mode=mode)

# Network parameters
learning_rate = 0.001
reg_param = 0.0
noise_param_value = 1.0
dropout_prob = 1.0
training_epochs = 4
display_step = 1