Example #1
def batch_to_file(batch, url, experiment_name, run, n_qu, n_lists, batch_n):

    # header = ['quid', 'question', 'example_pos', 'example_neg']
    header_new = ['quid','listNr', 'description', 'exampleTrue', 'exampleFalse',\
                  'triple', 'completionUrl', 'name']
    dirpath = f'../prolific_input/run{run}-group_{experiment_name}/'
    batch_name = f'qu{n_qu}-s_qu{n_lists}-batch{batch_n}'
    filepath = f'{dirpath}{batch_name}.csv'
    pl_name = f'Agree or disagree (run{run}-{experiment_name}-batch{batch_n}-{n_qu}-{n_qu})'

    ### write header ###
    if not os.path.isdir(dirpath):
        os.mkdir(dirpath)
    header_path = f'{dirpath}header.txt'
    if not os.path.isfile(header_path):
        with open(header_path, 'w') as outfile:
            outfile.write(','.join(header_new))
    ###
    new_dicts = []
    for d in batch:
        triple = f"{d['relation']}-{d['property']}-{d['concept']}"
        new_d = dict()
        new_d['quid'] = d['quid']
        new_d['listNr'] = d['listNr']
        new_d['description'] = d['question']
        new_d['exampleTrue'] = d['example_pos']
        new_d['exampleFalse'] = d['example_neg']
        new_d['run'] = run
        new_d['subList'] = 1
        new_d['completionUrl'] = url
        new_d['triple'] = triple
        new_d['name'] = pl_name
        new_dicts.append(new_d)
    to_csv(filepath, new_dicts, header=True)
    return filepath
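
The to_csv helper called above is not shown anywhere in this listing. As a rough sketch only (the real signature and behaviour may differ), a dict-based writer compatible with a call like to_csv(filepath, new_dicts, header=True) could look like this:

import csv

def to_csv(filepath, dicts, header=True):
    # Sketch under assumed behaviour: write a list of dicts to a CSV file,
    # taking the column order from the first dict and optionally writing a header row.
    if not dicts:
        return
    fieldnames = list(dicts[0].keys())
    with open(filepath, 'w', newline='') as outfile:
        writer = csv.DictWriter(outfile, fieldnames=fieldnames, extrasaction='ignore')
        if header:
            writer.writeheader()
        writer.writerows(dicts)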
Example #2
def results_to_csv(results, csv_file_name):
    def algorithm(r):
        if r.configs['Selection'] == 'ACO':
            return r.configs['Routing'] + '+' + r.configs[
                'Selection'] + '(' + str(
                    r.configs['AcoSelectionAlpha']) + ', ' + str(
                        r.configs['ReinforcementFactor']) + ')'
        else:
            return r.configs['Routing'] + '+' + r.configs['Selection']

    def benchmark(r):
        if r.configs['DataPacketTraffic'] == 'Trace':
            return r.props['bench']

        return r.configs['DataPacketTraffic']

    to_csv(csv_file_name, results, [
        ('Benchmark', benchmark),
        ('Routing', lambda r: r.configs['Routing']),
        ('Selection', lambda r: r.configs['Selection']),
        ('Data Packet Injection Rate',
         lambda r: r.configs['DataPacketInjectionRate']),
        ('ACO Selection Alpha', lambda r: r.configs['AcoSelectionAlpha']),
        ('Reinforcement Factor', lambda r: r.configs['ReinforcementFactor']),
        ('Algorithm', algorithm),
        ('Max Cycles', lambda r: r.configs['MaxCycles']),
        ('Simulation Time (Seconds)',
         lambda r: r.stats['SimulationTimeInSeconds']),
        ('Throughput', lambda r: r.stats['Throughput']),
        ('Average Packet Delay', lambda r: r.stats['AveragePacketDelay']),
        ('Payload Throughput', lambda r: r.stats['PayloadThroughput']),
        ('Average Payload Packet Delay',
         lambda r: r.stats['AveragePayloadPacketDelay']),
    ])
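
Here to_csv is used with a different shape: a list of result objects plus (column name, extractor) pairs. A minimal sketch of such a variant, with the signature assumed from the call above:

import csv

def to_csv(csv_file_name, results, columns):
    # Sketch (assumed signature): one header row from the column names,
    # then one row per result, each cell produced by the paired extractor.
    with open(csv_file_name, 'w', newline='') as outfile:
        writer = csv.writer(outfile)
        writer.writerow([name for name, _ in columns])
        for r in results:
            writer.writerow([extract(r) for _, extract in columns])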
Example #3
def run(args):
    l = loader.DataLoader(args.dataset, args.k, args.mode, args.dataset_path,
                          args.crowd_annotations_path, args.ground_truths_path)
    data, gt = l.get_data()
    result, accuracy = algorithms.main(args, data, gt)

    ind_to_question_dict = l.get_ind_to_question_dict()
    ind_to_annotation_dict = l.get_ind_to_annotation_dict()

    result_annotations = pd.DataFrame(data=result, columns=['Annotation'])
    result_annotations.reset_index(level=0, inplace=True)
    result_annotations = result_annotations.rename(
        columns={'index': 'Question'})

    result_annotations['Question'] = result_annotations['Question'].map(
        ind_to_question_dict)
    result_annotations['Annotation'] = result_annotations['Annotation'].map(
        ind_to_annotation_dict)

    if args.print_result:
        print("Predictions:")
        print(result_annotations)
        if args.mode == 'test':
            print("Accuracy:")
            print(accuracy)
    if args.output is not None:
        utils.to_csv(result, args.output, ind_to_question_dict,
                     ind_to_annotation_dict)
def main(p):

    train, test = utils.load(p)

    train['qqgeogor_jaccard-{}'.format(p)] = train.apply(
        lambda x: str_jaccard(x['q1'], x['q2']), axis=1)
    train['qqgeogor_levenshtein_1-{}'.format(p)] = train.apply(
        lambda x: str_levenshtein_1(x['q1'], x['q2']), axis=1)
    train['qqgeogor_levenshtein_2-{}'.format(p)] = train.apply(
        lambda x: str_levenshtein_2(x['q1'], x['q2']), axis=1)
    train['qqgeogor_sorensen-{}'.format(p)] = train.apply(
        lambda x: str_sorensen(x['q1'], x['q2']), axis=1)
    train['qqgeogor_set_intersection-{}'.format(p)] = train.apply(
        lambda x: calc_set_intersection(x['q1'], x['q2']), axis=1)

    test['qqgeogor_jaccard-{}'.format(p)] = test.apply(
        lambda x: str_jaccard(x['q1'], x['q2']), axis=1)
    test['qqgeogor_levenshtein_1-{}'.format(p)] = test.apply(
        lambda x: str_levenshtein_1(x['q1'], x['q2']), axis=1)
    test['qqgeogor_levenshtein_2-{}'.format(p)] = test.apply(
        lambda x: str_levenshtein_2(x['q1'], x['q2']), axis=1)
    test['qqgeogor_sorensen-{}'.format(p)] = test.apply(
        lambda x: str_sorensen(x['q1'], x['q2']), axis=1)
    test['qqgeogor_set_intersection-{}'.format(p)] = test.apply(
        lambda x: calc_set_intersection(x['q1'], x['q2']), axis=1)

    utils.to_csv(train, test, 'f103-{}'.format(p))

    return
def add_new_example_props():
    # Get property info
    path = '../data/property_info.csv'
    prop_dicts = read_csv(path)
    props_in_info = [d['property'] for d in prop_dicts]
    header = prop_dicts[0].keys()

    # Get example properties
    ex_files = glob.glob('../examples/*-pairs.csv')
    test_files = glob.glob('../data/test/*/*.csv')
    ex_files.extend(test_files)

    p_targets = ['prop_pos', 'prop_neg', 'property']
    for f in ex_files:
        dl = read_csv(f)
        for d in dl:
            for t in p_targets:
                if t in d:
                    prop = d[t]
                    if prop != '' and prop not in props_in_info:
                        print(f'"{prop}" needs annotation!')
                        new_d = dict()
                        new_d['property'] = prop
                        for h in header:
                            if h not in new_d:
                                new_d[h] = 'NEEDS INFO'
                        if new_d not in prop_dicts:
                            prop_dicts.append(new_d)
    print(f'Add info to added properties in: {path}')
    to_csv(path, prop_dicts)
Example #6
def main(args):
    start = time.time()
    input_train = pnd.read_csv(utils.get_corr_lemm_path(args.label))
    input_test = pnd.read_csv(utils.get_corr_lemm_path(args.label, test=True))
    y = pnd.read_csv(utils.get_labels_path(),
                     sep=';')[params.LABELS_COL].values
    X_train, X_val, y_train, y_val = train_test_split(input_train,
                                                      y,
                                                      test_size=0.2,
                                                      random_state=42)
    tokenizer = Tokenizer()
    vectorizer = MyVectorizer(is_sparse=True)
    pca = TruncatedSVD()
    svm = SVC(random_state=42)
    # Caching operations to avoid repetitions
    cachedir = mkdtemp()
    pipe = Pipeline(
        [
            ('tokenizer', tokenizer),
            ('vectorizer', vectorizer),
            # ('pca', pca),
            ('svm', svm)
        ],
        memory=cachedir)

    params_grid = dict(
        tokenizer__do_clustering=[True],
        tokenizer__n_clusters=[2, 3, 4, 5],
        tokenizer__max_df=[1.],
        vectorizer__max_df=[1.],
        vectorizer__max_features=[None],
        svm__C=[40.],
        svm__gamma=[0.05],
        svm__kernel=['rbf'],
        # pca__n_components=[1000, 2000, 3000, 4000],
    )
    params_pipe = dict(
        tokenizer__do_clustering=True,
        tokenizer__n_clusters=2,  # , 5, 20],
        tokenizer__max_df=1.,  # , 0.2],
        vectorizer__max_df=1.,  # , 0.05],
        vectorizer__max_features=None,
        svm__C=40.,  # , 100., 10., 1.],
        svm__gamma=0.05,  # , 0.05, 0.1],
        svm__kernel='rbf',
        # pca__n_components=3000,
    )
    pipe.set_params(**params_pipe)
    pipe.fit(input_train, y)
    print(pipe.score(X_val, y_val))
    predictions = pipe.predict(input_test)
    utils.to_csv(
        predictions, './results/%s/svm/y_pred.csv' %
        datetime.strftime(datetime.now(), "%Y_%m_%dT%H_%M_%S"))

    # grid_search = GridSearchCV(pipe, n_jobs=3, cv=3, param_grid=params_grid, verbose=3)
    # grid_search.fit(X_train, y_train)
    # utils.write_results(grid_search)
    print("It took %.3f" % (time.time() - start))
    def to_file(self, overwrite_existing=True):
        # filepath = 'questions/run_TEST-all-restrict_True.csv'
        filepath = f"../questions/run{self.run}-all-restricted_{self.restrict}.csv"
        if os.path.isfile(filepath) and not overwrite_existing:
            print('ATTENTION: run already exists. To overwrite it, set overwrite_existing to True.')
        else:
            utils.to_csv(filepath, self.questions)
            print(f'{len(self.questions)} questions written to: {filepath}')
Example #8
def multi(p):
    train_ = train.copy()
    test_  = test.copy()
    ix = list(range(0,3000,300))[p]
    words_ = words[ix:ix+300]
    
    for w in words_:
        train_['BOW_'+w] = train_['q1'].map(lambda x: w in x.split())*1 + train_['q2'].map(lambda x: w in x.split())*1
        test_['BOW_'+w]  = test_['q1'].map(lambda x: w in x.split())*1 + test_['q2'].map(lambda x: w in x.split())*1

    utils.to_csv(train_, test_, 'f009-word-{0}'.format(p))
Example #9
def correct_check(path, question_dicts):
    path_backup = path.replace('.csv', '-backup.csv')
    to_csv(path_backup, question_dicts)
    for d in question_dicts:
        if d['relation'] == (
                'affording_activity') and d['property'].startswith('made_of'):
            print(d['question'])
            d['question'] = d['question'].replace('I know that being (a/an)',
                                                  'I know that being')
            print(d['question'])
    to_csv(path, question_dicts)
Example #10
def add_quot_marks(question_dicts, path, rel='creative'):
    relevant_keys = ['question', 'example_pos', 'example_neg']
    for d in question_dicts:
        if d['relation'] == rel:
            for k in relevant_keys:
                phrase = d[k]
                print(phrase)
                phrase = phrase.replace('say (a/an)', 'say ``(a/an)')
                phrase = phrase.replace(', but I', '", but I')
                print(phrase)
                d[k] = phrase
    to_csv(path, question_dicts)
Example #11
    def get_tfidf_values(self, page_id_list, term_id_list):
        """
        Returns a list of (PageID, TermID, Tfidf)
        """
        v1 = to_csv(page_id_list)
        v2 = to_csv(term_id_list)

        self._cur.execute("""
            SELECT PageID, TermID, Tfidf
            FROM TfidfValues
            WHERE PageID IN (%s) AND TermID IN (%s);
        """ % (v1, v2))
        return self._cur.fetchall()
    def get_term_occurrences(self, page_id_list, term_id_list):
        """
        Returns a list of (PageID, TermID, Counter)
        """
        v1 = to_csv(page_id_list)
        v2 = to_csv(term_id_list)

        self._cur.execute("""
            SELECT PageID, TermID, Counter
            FROM TermOccurrences
            WHERE PageID IN (%s) AND TermID IN (%s);
        """ % (v1, v2))
        return self._cur.fetchall()
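
In these database helpers to_csv does not write a file at all; it turns an ID list into the comma-separated body of a SQL IN (...) clause. A minimal sketch under that assumption (the quoting rules are guessed):

def to_csv(values):
    # Sketch (assumed behaviour): 'v1,v2,v3' for numbers, quoted for strings,
    # suitable for interpolation into WHERE ... IN (%s).
    def fmt(v):
        if isinstance(v, str):
            return "'%s'" % v.replace("'", "''")
        return str(v)
    return ','.join(fmt(v) for v in values)

Some call sites further down (e.g. to_csv(..., separate=True)) suggest extra options this sketch does not cover. Interpolating values straight into SQL like this is only reasonable for trusted, internally generated IDs; anything user-supplied should go through parameterized queries instead.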
Example #13
    def get_term_occurrences(self, page_id_list, term_id_list):
        """
        Returns a list of (PageID, TermID, Counter)
        """
        v1 = to_csv(page_id_list)
        v2 = to_csv(term_id_list)

        self._cur.execute("""
            SELECT PageID, TermID, Counter
            FROM TermOccurrences
            WHERE PageID IN (%s) AND TermID IN (%s);
        """ % (v1, v2))
        return self._cur.fetchall()
    def get_tfidf_values(self, page_id_list, term_id_list):
        """
        Returns a list of (PageID, TermID, Tfidf)
        """
        v1 = to_csv(page_id_list)
        v2 = to_csv(term_id_list)

        self._cur.execute("""
            SELECT PageID, TermID, Tfidf
            FROM TfidfValues
            WHERE PageID IN (%s) AND TermID IN (%s);
        """ % (v1, v2))
        return self._cur.fetchall()
Example #15
def update_examples(question_dicts, label, ex_dict, rel, path):  # path: CSV file rewritten by to_csv below
    for d in question_dicts:
        if d[f'concept_{label}'] == ex_dict['concept_old'] \
        and d[f'prop_{label}'] == ex_dict['prop_old'] and d['relation'] == rel:
            ex = d[f'example_{label}']
            print('old', ex)
            d[f'concept_{label}'] = ex_dict['concept_new']
            d[f'prop_{label}'] = ex_dict['prop_new']
            new_ex = ex.replace(ex_dict['concept_old'], ex_dict['concept_new'])
            new_ex = new_ex.replace(ex_dict['prop_old'], ex_dict['prop_new'])
            if new_ex[-1] != '.':
                new_ex = new_ex + '.'
            d[f'example_{label}'] = new_ex
            print('new', d[f'example_{label}'])
    to_csv(path, question_dicts)
Example #16
    def get_page_data(self, page_id_list):
        """
        Returns a list of (PageID, PageName, Length)
        """
        var_string = to_csv(page_id_list)
        self._cur.execute("""
            SELECT PageID, PageName, Length
            FROM Pages
            WHERE PageID IN (%s);
        """ % var_string)
        return self._cur.fetchall()
    def get_document_frequencies(self, term_id_list):
        """
        Returns a list of (TermID, DocumentFrequency)
        """
        var_string = to_csv(term_id_list)

        self._cur.execute("""
            SELECT TermID, DocumentFrequency
            FROM DocumentFrequencies
            WHERE TermID IN (%s)
        """ % var_string)
        return self._cur.fetchall()
    def get_tfidf_totals(self, page_id_list):
        """
        Returns a list of (PageID, Total)
        """
        var_string = to_csv(page_id_list)

        self._cur.execute("""
            SELECT PageID, Total
            FROM TfidfTotals
            WHERE PageID IN (%s);
        """ % var_string)
        return self._cur.fetchall()
    def get_term_names(self, term_id_list):
        """
        Returns a list of (TermID, TermName)
        """
        var_string = to_csv(term_id_list)

        self._cur.execute("""
            SELECT TermID, TermName
            FROM Terms
            WHERE TermID IN (%s);
        """ % var_string)
        return self._cur.fetchall()
    def get_page_ids(self, page_name_list):
        """
        Returns a list of (PageName, PageID)
        """
        var_string = to_csv(page_name_list)

        self._cur.execute("""
            SELECT PageName, PageID
            FROM Pages
            WHERE PageName IN (%s);
        """ % var_string)
        return self._cur.fetchall()
def get_relations(run):
    dicts = []
    filepath = f'../templates/relation_overview_run{run}.csv'
    collection_relation_question_dict, level_relation_dict = read_template(run)
    for l, rels in level_relation_dict.items():
        l = int(l)
        if l == 1:
            l_name = 'all'
        elif l == 2:
            l_name = 'some'
        elif l == 3:
            l_name = 'few'
        for r in rels:
            d = dict()
            if r == 'creative':
                d['level'] = 'creative'
            else:
                d['level'] = l_name
            d['relation'] = r
            dicts.append(d)
    to_csv(filepath, dicts, header=True)
Example #23
    def get_document_frequencies(self, term_id_list):
        """
        Returns a list of (TermID, DocumentFrequency)
        """
        var_string = to_csv(term_id_list)

        self._cur.execute("""
            SELECT TermID, DocumentFrequency
            FROM DocumentFrequencies
            WHERE TermID IN (%s)
        """ % var_string)
        return self._cur.fetchall()
Example #24
    def get_page_ids(self, page_name_list):
        """
        Returns a list of (PageName, PageID)
        """
        var_string = to_csv(page_name_list)

        self._cur.execute("""
            SELECT PageName, PageID
            FROM Pages
            WHERE PageName IN (%s);
        """ % var_string)
        return self._cur.fetchall()
Example #25
    def get_tfidf_totals(self, page_id_list):
        """
        Returns a list of (PageID, Total)
        """
        var_string = to_csv(page_id_list)

        self._cur.execute("""
            SELECT PageID, Total
            FROM TfidfTotals
            WHERE PageID IN (%s);
        """ % var_string)
        return self._cur.fetchall()
Example #26
    def get_term_names(self, term_id_list):
        """
        Returns a list of (TermID, TermName)
        """
        var_string = to_csv(term_id_list)

        self._cur.execute("""
            SELECT TermID, TermName
            FROM Terms
            WHERE TermID IN (%s);
        """ % var_string)
        return self._cur.fetchall()
Example #27
def lambda_handler(event, context):
    "Lambda entry point"
    with open('config.json') as data_file:
        CONFIG = json.load(data_file)
    #Read the task completion sheet and filter current quarter data
    spreadsheet_data, headers = utils.read_sheet()
    filtered_data = utils.filter_current_quarter_data(spreadsheet_data)
    #Read the data into a csv
    complete_data = utils.to_csv(spreadsheet_data, headers)
    current_quarter_data = utils.to_csv(filtered_data, headers)
    #Upload the csv files to s3 bucket
    utils.upload_to_s3(complete_data, CONFIG['complete_sheet_s3_key'])
    utils.upload_to_s3(current_quarter_data, CONFIG['current_quarter_s3_key'])
    #Prepare the data to initiate transfer to dynamodb
    prepared_complete_data = utils.prepare_data(
        CONFIG['complete_sheet_s3_key'])
    prepared_quarter_data = utils.prepare_data(
        CONFIG['current_quarter_s3_key'])
    #Store the complete task completion sheet data if the DynamoDB table is empty
    utils.migrate_to_dynamodb(prepared_complete_data)
    #Update the dynamodb with edits to current quarter data
    utils.update_dynamodb(prepared_quarter_data)
    return "Read task completion sheet and populated dynamodb"
Example #28
    def get_page_links(self, page_id_list):
        """
        Returns a list of (PageID, TargetPageID, LinkCounter)
        """
        # Attempting to speed this up with a TargetID IN will
        # not work because there is no Index available on TargetID
        var_string = to_csv(page_id_list)

        self._cur.execute("""
            SELECT PageID, TargetPageID, Counter
            FROM PageLinks
            WHERE PageID IN (%s);
        """ % (var_string, ))

        return self._cur.fetchall()
def fetch_and_compose(sha_issuekey):
    sha, issue_key = sha_issuekey
    try:
        res = fetch_issue(issue_key)
    except Exception:
        # ignore communication failure
        return
    if not RAW:
        res = get_filtered(res)
        res['commit'] = sha
        if CSV:
            res = utils.to_csv([v for _, v in sorted(res.items())])
        else:
            res = json.dumps(res, sort_keys=True, separators=(',', ':'))
    utils.output(res)
    di = {c:c+suf for c in df.columns if '_from_the1owl' in c}
    df = df.rename(columns=di)
    
    col = [c for c in df.columns if '_from_the1owl' in c or 'id' in c]
    
    
    return df[col]



#==============================================================================

train, test = utils.load(0)
train = main(train, '').fillna(-1)
test  = main(test, '').fillna(-1)
utils.to_csv(train, test, 'f102-0')

train, test = utils.load(1)
train = main(train, '-stem').fillna(-1)
test  = main(test, '-stem').fillna(-1)
utils.to_csv(train, test, 'f102-1')

train, test = utils.load(2)
train = main(train, '-stop').fillna(-1)
test  = main(test, '-stop').fillna(-1)
utils.to_csv(train, test, 'f102-2')

train, test = utils.load(3)
train = main(train, '-stst').fillna(-1)
test  = main(test, '-stst').fillna(-1)
utils.to_csv(train, test, 'f102-3')
Example #32
    df['q2_large_share_ratio'] = df['q2_large_share'] / df['q2_large_len']

    col = df.dtypes[df.dtypes != 'object'].index.tolist()

    return df[col]


"""

df = train.sample(999)





"""
#==============================================================================
# main
#==============================================================================

train = main(train)
test = main(test)

utils.to_csv(train, test, 'f008')

print(
    """#==============================================================================
# SUCCESS !!! {}
#==============================================================================
""".format(__file__))
def add_page_index(terms, page, intra_links):
    term_list = Counter(terms)
    doc_length = sum(term_list.values())

    if doc_length >= MIN_PAGE_LENGTH:
        cur.execute("""
            SELECT PageID, Processed
            FROM Pages
            WHERE PageName=%s;
        """, (page, ))
        rows = cur.fetchone()
        cur.fetchall()

        if rows:
            page_id, processed = rows
            if processed:
                return  # No decent way to resolve this conflict (Issue #76)
            else:
                cur.execute("""
                    UPDATE Pages
                    SET PageName=%s, Length=%s, Processed=TRUE
                    WHERE PageID=%s;
                """, (page, doc_length, page_id))
        else:
            # On duplicate command is a hack to prevent a select statement
            cur.execute("""
                INSERT IGNORE INTO Pages (PageName, Length, Processed)
                VALUES (%s, %s, TRUE)
            """, (page, doc_length))
            page_id = cur.lastrowid

            # There was a duplicate entry case
            if page_id <= 0:
                return

        filtered_term_list = [a for (a, b) in term_list.items() if b > 1]
        var_string = to_csv(filtered_term_list, separate=True)
        cur.execute("""
            INSERT IGNORE INTO Terms (TermName)
            VALUES %s;
        """ % var_string)

        var_string = to_csv(filtered_term_list, separate=False)
        cur.execute("""
            SELECT TermID, TermName FROM Terms
            WHERE TermName IN (%s);
        """ % var_string)

        term_results = cur.fetchall()

        if term_results:
            termids = [(tid, term_list[name]) for (tid, name) in term_results]

            var_string = u'({},%s,%s),'.format(page_id) * len(term_results)
            var_string = var_string[:-1]

            cur.execute("""
                INSERT INTO TermOccurrencesTemp (PageID, TermID, Counter)
                VALUES %s;
            """ % var_string, itertools.chain.from_iterable(termids))

        var_string = u''
        page_links = {}

        for link, counter in intra_links.items():
            cur.execute("""
                INSERT INTO Pages (PageName, Processed)
                VALUES (%s, FALSE)
                ON DUPLICATE KEY UPDATE PageID=LAST_INSERT_ID(PageID);
            """, (link, ))

            # Handles conflicts gracefully using a dictionary
            target_page_id = cur.lastrowid
            if target_page_id not in page_links:
                page_links[target_page_id] = 0

            page_links[target_page_id] += counter

        # Generate the link pairs from the built dictionary
        for target_page_id, counter in page_links.items():
            var_string += u'(%d,%d,%d),' % (page_id, target_page_id, counter)

        # Perform one large batch insert rather than individual inserts
        if var_string:
            var_string = var_string[:-1]
            cur.execute("""
                INSERT INTO PageLinks (PageID, TargetPageID, Counter)
                VALUES %s;
            """ % var_string)
Example #34
def opinions_per_month_per_city():
    return session.query(func.date_part('month',Opinion.date),City.name, func.count('*')).select_from(Opinion).join(Hotel).join(Address).join(City).group_by(City.name, func.date_part('month',Opinion.date)).all()


def make_histogram(iterable,low,high,bins,shift):
    step = (high - low + 0.0) / bins
    dist = Counter((float(x) - low + shift) // step for x in iterable)
    return [dist[b] for b in range(bins)]
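
make_histogram buckets values into fixed-width bins of width (high - low) / bins; a quick illustration of the arithmetic (the sample values below are made up):

# 6 values in [0, 10) split into 5 bins of width 2, with shift=0
print(make_histogram([0.5, 1.5, 2.5, 5.0, 9.0, 9.9], low=0, high=10, bins=5, shift=0))
# -> [2, 1, 1, 0, 2]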


subq1 = session.query(City.name,City.id,func.count(Hotel.id).label('count')).select_from(Opinion).join(Hotel).join(Address).join(City).group_by(City.id,Hotel.id).subquery()
subq2 = session.query(Hotel.stars,func.count(Hotel.id).label('count')).select_from(Opinion).join(Hotel).group_by(Hotel.id).subquery()

result = opinions_per_tag()
utils.to_csv(result, "opinions_per_tag.out")

result = hotel_dist(Hotel.stars)
utils.to_csv(result, "hotel_dist_stars.out")

result = hotel_dist(Hotel.price_level)
utils.to_csv(result, "hotel_dist_price.out")

result = avg_ops(subq1.c.name,subq1)
utils.to_csv(result, "avg_ops_hotel_cities.out")

result = avg_ops(subq2.c.stars,subq2)
utils.to_csv(result, "avg_ops_hotel_stars.out")

result = city_hotel_dist(Hotel.stars)
utils.to_csv(result, "city_hotel_stars.out")
Example #35
        lambda x: fuzz.partial_token_sort_ratio(str(x['q1']), str(x['q2'])),
        axis=1)
    df['fuzz_token_set_ratio' + suf] = df.apply(
        lambda x: fuzz.token_set_ratio(str(x['q1']), str(x['q2'])), axis=1)
    df['fuzz_token_sort_ratio' + suf] = df.apply(
        lambda x: fuzz.token_sort_ratio(str(x['q1']), str(x['q2'])), axis=1)

    return df


#==============================================================================

train, test = utils.load(0)
train = main(train, '')
test = main(test, '')
utils.to_csv(train, test, 'f100-0')
del train, test
gc.collect()

train, test = utils.load(1)
train = main(train, '-stem')
test = main(test, '-stem')
utils.to_csv(train, test, 'f100-1')
del train, test
gc.collect()

train, test = utils.load(2)
train = main(train, '-stop')
test = main(test, '-stop')
utils.to_csv(train, test, 'f100-2')
del train, test
    def print_top_terms(clf_descr, category, cpairs):
        to_csv(
            os.path.join(opts.output_dir, "feat_%s_%s.csv" % (clf_descr, category)),
            [(re.sub('^[A-Za-z]+__', '', t).encode("utf-8"), c) for t, c in cpairs])
    def fit(self,
            ratings: np.ndarray,
            validset: np.ndarray = None,
            epochs: int = 20) -> None:
        """
        Arguments
        ---------
        ratings : np.ndarray
            ratings matrix for training (i.e. train dataset)
        validset : np.ndarray
            validation dataset
        epochs : int
            number of iterations
        """

        # average of ratings
        self.mean_rating = np.mean(ratings[:, 2])
        # minimum of ratings
        if self.min_rating > np.min(ratings[:, 2]):
            self.min_rating = np.min(ratings[:, 2])

        best_loss = 0

        # initialize user gradient & momentum
        user_feature_grads, user_feature_mom = (
            np.zeros((self.n_user, self.n_feature)),
            np.zeros((self.n_user, self.n_feature)),
        )
        # initialize item gradient & momentum
        item_feature_grads, item_feature_mom = (
            np.zeros((self.n_item, self.n_feature)),
            np.zeros((self.n_item, self.n_feature)),
        )

        batch_num = int(np.ceil(ratings.shape[0] / self.batch_size))
        start_time = time.time()
        self.train_losses = []
        self.valid_losses = []
        for epoch in range(1, epochs + 1):
            # dataset shuffling
            np.random.shuffle(ratings)

            for batch_idx in range(batch_num):
                start_idx = int(batch_idx * self.batch_size)
                end_idx = int((batch_idx + 1) * self.batch_size)
                batch = ratings[start_idx:end_idx]

                # compute gradient
                user_ids = batch.take(0, axis=1).astype(int)
                item_ids = batch.take(1, axis=1).astype(int)
                u_features = self.user_features.take(user_ids, axis=0)
                i_features = self.item_features.take(item_ids, axis=0)

                outputs = np.sum(u_features * i_features, axis=1)
                errs = outputs - (batch.take(2, axis=1) - self.mean_rating)

                err_mat = np.tile(2 * errs, (self.n_feature, 1)).T
                user_grads = i_features * err_mat + self.reg * u_features
                item_grads = u_features * err_mat + self.reg * i_features

                # clear all gradients
                user_feature_grads.fill(0.0)
                item_feature_grads.fill(0.0)
                for idx in range(batch.shape[0]):
                    user_id, item_id, rating = batch[idx]
                    user_id, item_id = int(user_id), int(item_id)

                    user_feature_grads[user_id, :] += user_grads[idx]
                    item_feature_grads[item_id, :] += item_grads[idx]

                # update momentum
                user_feature_mom = (self.momentum * user_feature_mom +
                                    self.learning_rate * user_feature_grads)
                item_feature_mom = (self.momentum * item_feature_mom +
                                    self.learning_rate * item_feature_grads)
                # update user/item matrix
                self.user_features -= user_feature_mom
                self.item_features -= item_feature_mom

            # rmse train loss
            train_preds = self.predict(ratings[:, :2])
            train_loss = rmse(train_preds, ratings[:, 2])
            self.train_losses.append(train_loss)

            # save losses
            if validset is None:
                print(
                    f"ellapse: {time_since(start_time)} | epoch: {epoch:03d} | train RMSE: {train_loss:.6f}"
                )

            else:
                valid_preds = self.predict(validset[:, :2])
                valid_loss = rmse(valid_preds, validset[:, 2])
                self.valid_losses.append(valid_loss)
                print(
                    f"ellapse: {time_since(start_time)} | epoch: {epoch:03d} | train RMSE: {train_loss:.6f} | valid RMSE: {valid_loss:.6f}"
                )

                # save csv
                if best_loss == 0 or valid_loss < best_loss:
                    best_loss = valid_loss

                    result_dir = f"{self.save_dir}/results"
                    weight_dir = f"{self.save_dir}/weights"

                    to_csv(f"{result_dir}/output_val_pmf.csv",
                           valid_preds,
                           validset,
                           header=True)
                    self._save_model(weight_dir)
                    print(f"save result and model weights at {best_loss}")

        return None
Example #38
    return df


#==============================================================================

model = gensim.models.KeyedVectors.load_word2vec_format(
    '../nlp_source/GoogleNews-vectors-negative300.bin.gz', binary=True)

norm_model = gensim.models.KeyedVectors.load_word2vec_format(
    '../nlp_source/GoogleNews-vectors-negative300.bin.gz', binary=True)
norm_model.init_sims(replace=True)

train, test = utils.load(0)
train = main(train, '')
test = main(test, '')
utils.to_csv(train, test, 'f003-0')
del train, test
gc.collect()

train, test = utils.load(1)
train = main(train, '-stem')
test = main(test, '-stem')
utils.to_csv(train, test, 'f003-1')
del train, test
gc.collect()

train, test = utils.load(2)
train = main(train, '-stop')
test = main(test, '-stop')
utils.to_csv(train, test, 'f003-2')
del train, test
        SGDClassifier(loss='log', alpha=1e-4, n_iter=50, penalty=penalty),
        "SGD_" + penalty.upper() + "_std"))

# print('=' * 80)
# print("SGD with elasticnet penalty")
# results.append(benchmark(
#     SGDClassifier(loss='hinge', alpha=1e-4, n_iter=50, penalty='elasticnet',
#                   l1_ratio=0.10),
#     "SGD (elasticnet penalty)"))

print('=' * 80)
print("SGD L1 feature selection")
clf = with_l1_feature_selection(
    SGDClassifier, loss='log', alpha=0.00021, n_iter=10
    )(loss='log', alpha=.0001, n_iter=50)
results.append(benchmark(clf, "SGD_L1_featsel"))


# print('=' * 80)
# print("Radial kernal svc")
# results.append(benchmark(SVC(kernel='rbf')))


if opts.output_roc:
    print("Writing ROC curve data")
    to_csv(opts.output_roc, all_roc_data)

if opts.output:
    print("Writing scores data")
    to_csv(opts.output, results)
Example #40
    utils.to_csv(train_, test_, 'f009-word-{0}'.format(p))



pool = mp.Pool(total_proc)
callback = pool.map(multi, range(total_proc))

#==============================================================================
# ents
#==============================================================================

train, test = utils.load(3)

files = sorted(glob('../nlp_source/ent*'))

for f in files:
    words = pd.read_csv(f).head(30).word.tolist()
    for w in words:
        train['ent_'+w] = train['q1'].map(lambda x: w.lower() in x.lower().split())*1 + train['q2'].map(lambda x: w.lower() in x.lower().split())*1
        test['ent_'+w]  = test['q1'].map(lambda x: w.lower() in x.lower().split())*1 + test['q2'].map(lambda x: w.lower() in x.lower().split())*1

utils.to_csv(train, test, 'f009-ent')


print("""#==============================================================================
# SUCCESS !!! {}
#==============================================================================
""".format(__file__))

Example #41
def get_basic_csv():
    """获取首页基础信息"""
    servants = get_basic_info_of_servants(as_raw=True)
    to_csv('data/servants.csv', servants)
Example #42
def update_log(new_log_dict):
    path = '../task_set_up/experiment_log.csv'
    log_dicts = read_csv(path)
    log_dicts.append(new_log_dict)
    to_csv(path, log_dicts)
    print(f'updated log: {path}')