Example #1
def main(training_file, test_file, submission_file, ratio):
    data = utilities.read_file(training_file)
    test_data = utilities.read_file(test_file)

    print 'Preparing data...'
    x, y = preprocess.prepare_data(data)
    refid, x_test = preprocess.prepare_test_data(test_data)
    x, x_test = preprocess.preprocess_features(x, x_test)

    print 'Feature extracting...'
    x, x_test = feature_extraction.create_feature(x, y, x_test)

    indices = feature_extraction.get_best_k_feature_indices(x, y, 300)
    x = feature_extraction.get_best_k_features(x, indices)
    x_test = feature_extraction.get_best_k_features(x_test, indices)
    print 'Get %s features.' % len(x[0])

    x_train, x_cv, y_train, y_cv = cross_validation.train_test_split(
        x, y, test_size=.3, random_state=0)
    x_train, y_train = preprocess.down_sample(x_train, y_train, ratio)

    clf = classification.random_forest(x_train, y_train, x_cv, y_cv)

    print 'Predicting...'
    predict = clf.predict_proba(x_test)
    utilities.write_submission_file(submission_file, refid, predict)
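A hypothetical driver for the pipeline above; the file paths and the down-sampling ratio are illustrative assumptions, not part of the example:

if __name__ == '__main__':
    # Train on data/train.csv, predict data/test.csv, write submission.csv,
    # passing a down-sampling ratio of 0.1 (all values assumed).
    main('data/train.csv', 'data/test.csv', 'submission.csv', 0.1)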
Example #3
def correct_ADV():
    contents = read_file('data/new-ADV.txt', strip=True, dict_format=True)
    entries = read_file('data/cebposdict-nc.txt', strip=True, dict_format=True)

    for key, value in contents.iteritems():
        if 'REM' in value:
            if key in entries:
                del entries[key]
        else:
            entries[key] = value

    if len(contents):
        write_file('data/cebposdict-nc.txt',
                   contents=[''],
                   add_newline=False,
                   mode='w')
        for key, value in sorted(entries.iteritems()):
            new_entry = [key + ' ']
            value = list(set(value))
            new_entry.append(' '.join(value))
            new_entry.append('\n')
            write_file('data/cebposdict-nc.txt',
                       contents=new_entry,
                       add_newline=False,
                       mode='a')
            new_entry = []
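The read_file helper itself is not part of these snippets; a minimal sketch of a reader compatible with the strip=True, dict_format=True calls above, assuming each dictionary line holds a word followed by its tags:

def read_file(name, strip=False, dict_format=False):
    # Hypothetical reader matching the calls above (not the project's own code):
    # with dict_format=True each line "word TAG1 TAG2 ..." becomes
    # {"word": ["TAG1", "TAG2", ...]}; otherwise the lines are returned as a list.
    with open(name) as handle:
        lines = handle.readlines()
    if strip:
        lines = [line.strip() for line in lines]
    if not dict_format:
        return lines
    entries = {}
    for line in lines:
        parts = line.split()
        if parts:
            entries[parts[0]] = parts[1:]
    return entries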
Example #4
def scrape_news_contents():
    checkpoint = read_file("data/scraped/cp/news-links-cp.txt")
    start = int(checkpoint[0])
    if start == 501:
        print("Status: Finished!")
        return

    urls = read_file("data/scraped/news-links.txt", start=start)
    contents = []
    for idx, url in enumerate(urls):
        start += 1
        print("Link [" + str(start) + "]: " + url)
        page = urlopen(url)
        soup = BeautifulSoup(page, 'html.parser')
        div = soup.find('div', {
            'class': 'field-item even',
            'property': 'content:encoded'
        })
        for child in div.findChildren():
            contents.append(child.getText())
        write_file("data/scraped/news-raw-nc.txt",
                   contents=contents,
                   per_line=True,
                   mode="a")
        contents = []
        endpoints = [str(start + 1)]

        write_file("data/scraped/cp/news-links-cp.txt",
                   contents=endpoints,
                   mode="w")
Example #5
def extract_features_multiprocess(ast_directory, features_directory, feature_type_to_extract, js_keywords_file, cpu_to_relax):
    all_files = utilities.get_files_in_a_directory(ast_directory)
    temp = utilities.read_file(js_keywords_file)

    keywords_list = []
    for item in temp:
        keywords_list.append(item.strip())

    raw_extracted_files = utilities.get_files_in_a_directory(features_directory)
    completed_feature_files = []
    
    for unpacked in raw_extracted_files:
        completed_feature_files.append(unpacked.split('/')[-1].split('.')[0])
    
    raw_extracted_files = []
    remaining_unpacked_files = []

    if os.path.isfile('ast_parsing.log'):
        raw_processed = utilities.read_file('ast_parsing.log')
    else:
        raw_processed = []

    processed_log = set()
    for item in raw_processed:
        processed_log.add(item.split(' ')[0].split('.')[0])

    raw_processed = []

    for unpacked in all_files:
        if unpacked.split('/')[-1].split('.')[0] in completed_feature_files: # or unpacked.split('/')[-1].split('.')[0] in processed_log:
            continue
        else:
            remaining_unpacked_files.append(unpacked)
    
    completed_feature_files = []
    all_files = []
    processed_log = set()

    print(len(remaining_unpacked_files), 'files to process')
    pool = ThreadPool(processes=multiprocessing.cpu_count() - cpu_to_relax)

    try:
        # all_files was emptied above to free memory, so every branch walks
        # the remaining (not yet processed) files.
        if feature_type_to_extract == ALL:
            results = pool.starmap(new_walk, zip(remaining_unpacked_files, itertools.repeat(features_directory)))
        elif feature_type_to_extract == NO_NAMES:
            results = pool.starmap(new_walk_no_names, zip(remaining_unpacked_files, itertools.repeat(features_directory)))
        elif feature_type_to_extract == KEYWORD:
            results = pool.starmap(new_walk_reserved_words, zip(remaining_unpacked_files, itertools.repeat(features_directory), itertools.repeat(keywords_list)))
        
        # for f_name in all_files:
        #     new_walk_reserved_words(f_name, features_directory, keywords_list)
        # utilities.append_list(f_name.replace(directory_path,result_directory).replace('json', 'txt'), raw_features)
    except Exception as e:
        print ('Exception in main thread: ', str(e))

    pool.close()
    pool.join()

    return
Example #7
    def _get_previous_version(self):
        """
        Get current installed mysql version from cache file
        """
        if os.path.exists(self.CACHE_VERSION_FILE):
            return read_file(self.CACHE_VERSION_FILE)

        return None
Example #8
    def post(self):
        start_time = time.time()
        args = self.parser.parse_args()

        # read data
        params = read_params(args['params'].stream)
        df = read_file(args['raw_data'].stream.read())
        y_train = read_file(args['labels'].stream.read())

        # build features
        X_train = build_features(df, params)
        y_train = y_train.set_index('example_id')
        y_train = y_train.loc[X_train.index]

        # train model
        cl = train_model(X_train, y_train.label, params)
        self.model_factory.add_pipeline(cl, params)
        if isinstance(cl, tpot.TPOTClassifier):
            final_classifier = cl.fitted_pipeline_
            evaluated_indivs = cl.evaluated_individuals_
        else:
            final_classifier = cl
            evaluated_indivs = None
        model_type = str(final_classifier)
        mean_accuracy, mean_roc_auc = cross_validate(final_classifier, X_train,
                                                     y_train.label)

        # format feat_eng_params
        feat_eng_params = params['extract_features'].copy()
        for k in feat_eng_params.keys():
            if k == 'default_fc_parameters':  # shows calculations like min, mean, etc.
                feat_eng_params[k] = str(feat_eng_params[k].keys())
            elif k == 'impute_function':
                feat_eng_params[k] = str(feat_eng_params[k].__name__)
            else:
                feat_eng_params[k] = str(feat_eng_params[k])


#        for k in feat_eng_params:
#            feat_eng_params[k] = str(feat_eng_params[k])
        result = {
            'trainTime': time.time() - start_time,
            'trainShape': X_train.shape,
            'modelType': model_type,
            'featureEngParams': feat_eng_params,
            'modelId': params['pipeline_id'],
            'mean_cv_accuracy': mean_accuracy,
            'mean_cv_roc_auc': mean_roc_auc,
            'evaluated_models': evaluated_indivs
        }
        self.model_factory[params['pipeline_id']]['stats'] = result
        return json.dumps(result)
Example #9
def get_data(data_file, test_file):
    """ Produces training set, cross validation set and test set. """

    raw_data = utilities.read_file(data_file, True)
    test_data = utilities.read_file(test_file, True)
    x = array(raw_data, float64)
    y = x[:, 0]
    x = x[:, 1 : :]
    x_train, x_cv, y_train, y_cv = cross_validation.train_test_split(
        x, y, test_size=0.3, random_state=None)
    x = array(test_data, float64)
    y_test = x[:, 0]
    x_test = x[:, 1 : :]

    return (x_train, y_train, x_cv, y_cv, x_test, y_test)
Example #10
    def _before_install_new_packages(self):
        """
        Specific actions before new packages installation
        """
        print "The installation of MySQL for db_governor has started"

        check_file("/usr/local/directadmin/custombuild/build")
        check_file("/usr/local/directadmin/custombuild/options.conf")

        # MYSQL_DA_TYPE=`cat /usr/local/directadmin/custombuild/options.conf | grep mysql_inst= | cut -d= -f2`
        try:
            MYSQL_DA_TYPE = grep("/usr/local/directadmin/custombuild/options.conf", "mysql_inst=")[0].split("=")[1]
        except IndexError:
            MYSQL_DA_TYPE = ""

        if os.path.exists("/usr/share/lve/dbgovernor/da.tp.old"):
            if MYSQL_DA_TYPE == "no":
                MYSQL_DA_TYPE = read_file("/usr/share/lve/dbgovernor/da.tp.old")
            else:
                write_file("/usr/share/lve/dbgovernor/da.tp.old", MYSQL_DA_TYPE)
        else:
            write_file("/usr/share/lve/dbgovernor/da.tp.old", MYSQL_DA_TYPE)

        exec_command_out("/usr/local/directadmin/custombuild/build set mysql_inst no")

        self._mysqlservice("stop")
Example #11
def remove_function_words():
    entries = read_file('data/cebposdict-nc.txt', strip=True, dict_format=True)

    function_tags = ['DET', 'PART', 'CONJ', 'PRON']
    function_words = {'DET': [], 'PART': [], 'CONJ': [], 'PRON': []}

    for tag in function_tags:
        for key, value in entries.iteritems():
            if tag in value:
                value.append('REM')
                function_words[tag].append(key)
    write_file('data/cebposdict-nc.txt',
               contents=[''],
               add_newline=False,
               mode='w')
    for key, value in sorted(entries.iteritems()):
        if 'REM' not in value:
            new_entry = [key + ' ']
            value = list(set(value))
            new_entry.append(' '.join(value))
            new_entry.append('\n')
            write_file('data/cebposdict-nc.txt',
                       contents=new_entry,
                       add_newline=False,
                       mode='a')
            new_entry = []

    for key, value in function_words.iteritems():
        write_file('data/' + key + '.txt',
                   contents=value,
                   add_newline=False,
                   append_newline=True,
                   mode='w')
Example #12
def generate_training_set(follow, followed, ratio, solution_file, data_file):
    """ Uses the solution file to generate training set to train
    the model, hoping this method can get better result.
    Ratio controls the fraction of pos and neg data sets, if ratio is -1,
    the fraction is the origion fraction."""

    raw_solution = utilities.read_file(solution_file, False)
    dict_solution = {}
    for i in range(len(raw_solution)):
        row = raw_solution[i]
        dict_solution[int(row[0])] = set(int(n) for n in row[1 : :])

    x_train = [['spring brother is a true man']]
    for node in dict_solution.keys():
        nodes_pos = dict_solution[node]
        for n in nodes_pos:
            features = rank.get_features(follow, followed, node, n)
            x_train.append([1] + features)

        nodes_neg = candidate.get_candidates(follow, followed, node)
        nodes_neg.difference_update(nodes_pos)
        nodes_neg = list(nodes_neg)
        perm = random.permutation(len(nodes_neg))
        if ratio != -1:
            num = min(int(len(nodes_pos) * ratio), len(nodes_neg))
        else:
            num = len(nodes_neg)
        for i in range(num):
            # Pair the source node with the sampled negative candidate.
            neg = nodes_neg[perm[i]]
            features = rank.get_features(follow, followed, node, neg)
            x_train.append([0] + features)

    utilities.write_file(data_file, x_train)
Example #13
def to_panda_data():
    output = read_file('data/output_tokens.txt', strip=True)
    panda_data = []
    indexes = []
    for o in output:
        data = {}
        o = o.split(' ')
        indexes.append(o[0])
        data['root'] = o[1]

        data['is_root'] = o[0] == o[1]
        if o[2] != 'None':
            data['prefix'] = o[2]

        if o[3] != 'None':
            data['infix'] = o[3]

        if o[4] != 'None':
            data['suffix'] = o[4]

        data['is_entry'] = True if int(o[5]) == 1 else False

        if len(o) == 7:
            data['is_valid'] = True if int(o[6]) == 1 else False
        else:
            data['is_valid'] = 'Null'
        panda_data.append(data)

    return {'data': panda_data, 'index': indexes}
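The returned dictionary is shaped for pandas; a short usage sketch (the pandas import and the variable names are assumptions):

import pandas as pd

# Rows come from the per-token dictionaries, indexed by the first token of
# each output line.
result = to_panda_data()
df = pd.DataFrame(result['data'], index=result['index'])
print(df.head())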
Example #14
def write_non_dominated_frontiers(time, files_names, possible_values,
                                  dir_path):
    char_n_file_gen = Wise_permutations(files_names)
    for file_tp, n in char_n_file_gen:
        # Create folder and move there if not exists
        os.chdir(dir_path)
        create_folder(f'{n}{file_tp}')

        print(f'\n\n -------- {n}{file_tp} --------- \n\n')
        os.chdir(f'{dir_path}{n}{file_tp}/')

        fp = f'~/Dropbox/PI/PI2/data/n{n}q10{file_tp}.dat'
        file_stations = util.read_file(fp)
        stations = read_stations(file_stations)  # list of stations
        Sol.set_stations(stations)

        # For every file let us calculate every frontier
        tuples_combinations = Wise_permutations(possible_values)
        for params in tuples_combinations:
            print('params --> ', params)
            # Train model
            n_pob, ratio_sons, ratio_mutation, num_random_sols = params
            solutions = SolCollection(n_pob=n_pob,
                                      ratio_sons=ratio_sons,
                                      ratio_mutation=ratio_mutation,
                                      num_random_sols=num_random_sols)
            non_dom_result = solutions.train_time(time)
            # Save file
            out_file_name = tuples_combinations.string_params(params) + '.txt'
            np.savetxt(out_file_name, non_dom_result)
Example #15
 def get_mysql_user(self):
     """
     Retrieve MySQL user name and password and save it into self attributes
     """
     if os.path.exists(self.DBPASS_PATH):
         self.MYSQLUSER = "******"
         self.MYSQLPASSWORD = read_file(self.DBPASS_PATH)
Example #16
 def read(self, filePath):
     """Return metadatas and content of a markdown file"""
     
     mdContent = read_file(filePath)
     md = markdown.Markdown(extensions=['meta', 'codehilite'])
     htmlContent = md.convert(mdContent)
     return htmlContent, md.Meta
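For reference, a small sketch of the front-matter format the 'meta' extension parses; the sample document is illustrative only:

import markdown

sample = '''Title: A sample post
Author: Someone

# Heading

Body text.
'''

md = markdown.Markdown(extensions=['meta', 'codehilite'])
html = md.convert(sample)
print(md.Meta)   # {'title': ['A sample post'], 'author': ['Someone']}
print(html)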
Example #17
def resolve_equals():
    write_file('data/cebposdict-4.txt',
               contents=[''],
               no_encode=True,
               add_newline=False,
               mode='w')
    entries = read_file('data/cebposdict-3.txt', dict_format=True)
    result = []
    for key, value in entries.iteritems():
        words = nltk.word_tokenize(" ".join(value))
        new_entry = [key + ' ']
        related_words = []
        for word in words:
            if word in ['PART', 'ADJ', 'PRON', 'VERB', 'NOUN', 'NUM']:
                new_entry.append(word + ' ')
            elif word != '=':
                related_words.append(word)

        for rel_word in related_words:
            if rel_word in entries:
                values = entries[rel_word]
                words = nltk.word_tokenize(" ".join(values))
                # words = list(Text(" ".join(values)).words)
                for word in words:
                    if word in ['PART', 'ADJ', 'PRON', 'VERB', 'NOUN', 'NUM']:
                        new_entry.append(word + ' ')

        new_entry.append('\n')
        write_file('data/cebposdict-4.txt',
                   contents=new_entry,
                   add_newline=False,
                   mode='a')
        new_entry = []

    print('resolve_equals: Finished!')
Example #18
def scrape_news_links():
    links = read_file('data/scraped/news-links.txt')
    if len(links) == 500:
        print("Status: Finished!\n")
        return

    url = "http://www.sunstar.com.ph/superbalita-cebu/balita"
    main_url = urlparse.urlparse(url).scheme + '://' + urlparse.urlparse(
        url).hostname
    stop_scraping_process = False
    i = 0
    limit = 500
    while i < limit and not stop_scraping_process:
        page = urlopen(url)
        soup = BeautifulSoup(page, 'html.parser')
        titles = soup.findAll('h3', {'class': 'title'})
        for title in titles:
            child = title.findChildren()[0]
            write_file("data/scraped/news-links.txt",
                       contents=[main_url + child.get('href')],
                       mode="a")
            print(main_url + child.get('href'))
            print("\n")
            i += 1
            if i == limit:
                break

        next_page = soup.find('a', {'title': 'Go to next page'})
        if next_page:
            url = main_url + next_page.get('href')
        else:
            stop_scraping_process = True
Example #20
def contextual_rules():
    raw = read_file('data/rules/CONTEXTUAL.txt', strip=True)
    rules = []

    for r in raw:
        rule = ContextualRule()

        string = r.split(' ')

        rule.operator = string[0]
        rule.target = string[1]

        i = 2
        while i < len(string):
            condition = ContextCondition()

            position = string[i]

            if 'C' in position:
                condition.careful_mode = True
                position = position.replace('C', '')

            condition.position = int(position)
            condition.pos_tag = string[i + 1]

            rule.context_conditions.append(condition)

            i += 2

        rules.append(rule)

    return rules
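The rule containers are not shown in the snippet; a minimal sketch of classes compatible with the attributes the parser sets (the class names mirror the code above, the defaults are assumptions):

class ContextCondition(object):
    # One positional condition of a contextual rule.
    def __init__(self):
        self.position = 0          # relative position of the context word
        self.pos_tag = None        # POS tag expected at that position
        self.careful_mode = False  # set when the position carries a 'C' flag


class ContextualRule(object):
    # A rule of the form: OPERATOR TARGET (POSITION TAG)*
    def __init__(self):
        self.operator = None            # first token of the rule line
        self.target = None              # second token: the tag acted upon
        self.context_conditions = []    # ContextCondition objects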
Example #22
    def _get_new_version(self):
        """
        Get new sql version for install
        """
        if os.path.exists(self.NEW_VERSION_FILE):
            return read_file(self.NEW_VERSION_FILE)

        return "auto"
Example #23
 def upload_release_to_github(self):
     # Draft the upload to github
     release_notes = read_file(self.details.new_release_notes_path)
     pyperclip.copy(release_notes)
     print('The release notes are on the clipboard')
     github_url = F"'https://github.com/approvals/ApprovalTests.cpp/releases/new?tag={self.details.new_version}&title=Single%20Hpp%20File%20-%20{self.details.new_version}'"
     run(["open", github_url])
     run(["open", self.details.release_dir])
     check_step("that the release is published")
Example #24
def get_entries():
    entries = read_file(name='data/cebposdict.txt',
                        strip=True,
                        dict_format=True)

    func_words = ['CONJ', 'DET', 'PART', 'PRON']

    for func in func_words:
        words = read_file(name='data/function_words/' + func + '.txt',
                          strip=True)

        for word in words:
            if word in entries:
                entries[word].append(func)
            else:
                entries[word] = [func]

    return entries
Example #26
def mean_average_precision(result_file, solution_file):
    """ Calculates the mean average precision. """

    raw_result = utilities.read_file(result_file, True)
    raw_solution = utilities.read_file(solution_file, False)
    dict_result = {}
    for row in raw_result:
        dict_result[row[0]] = row[1 : :]
    dict_solution = {}
    for row in raw_solution:
        dict_solution[row[0]] = set(row[1 : :])

    res = 0.0
    for key in dict_result.keys():
        prediction = dict_result[key][0].split()
        ground_truth = dict_solution[key]
        res += ap(ground_truth, prediction)
    res /= len(dict_result)
    print 'mean average precision = %f' % res
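The ap helper is not included in the example; a minimal sketch of average precision over a ranked prediction list, using the standard definition rather than the project's own implementation:

def ap(ground_truth, prediction):
    # Accumulate precision at every rank where a new relevant item appears,
    # then normalise by the number of relevant items.
    if not ground_truth:
        return 0.0
    hits = 0
    score = 0.0
    for i, p in enumerate(prediction):
        if p in ground_truth and p not in prediction[:i]:
            hits += 1
            score += hits / float(i + 1)
    return score / len(ground_truth)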
Example #27
def baseline(training_file, submission_file, output_file):
    data = utilities.read_file(training_file)
    sub_data = utilities.read_file(submission_file, True)

    print 'Calculating hour averages...'
    hour_avg_by_chunk = utilities.get_hour_avg_by_chunk(data)
    hour_avg = utilities.get_hour_avg(data)

    print 'Filling submission file...'
    for i in range(1, len(sub_data)):
        chunk_id = sub_data[i][1]
        hour = sub_data[i][3]
        for j in range(5, len(sub_data[i])):
            if sub_data[i][j] == '0':
                if chunk_id in hour_avg_by_chunk:
                    sub_data[i][j] = hour_avg_by_chunk[chunk_id][hour][j - 5]
                else:
                    sub_data[i][j] = hour_avg[hour][j - 5]

    utilities.write_file(output_file, sub_data)
Example #28
 def show_user_manual(self, widget, help_window, parent_window):
     """ Shows the configuration window.
     parameters:
         [AppGTK] self -- the self instance.
         [gtk.Widget] widget -- the widget event.
         [gtk.Object] config_window -- The configuration window object.
         [gtk.Object] config_window -- The start window object(parent).
     """
     if help_window and parent_window:
         self.help_label.set_text(utilities.read_file("user_manual.txt"))
         help_window.show()
Example #30
def read_shopping_articles_from_file_to_json():
    path = os.getcwd()
    path = os.path.join(path, "data/")
    file = "shopping_articles.txt"
    raw_file = utils.read_file(base_path=path, filename=file, file_type="json")
    if type(raw_file) == dict:
        articles_json = raw_file
    else:
        articles_json = None
        logging.error(f"Read in file is type {type(raw_file)}!")
    return articles_json
Example #32
    def update_conan_config_yml(self, conan_approvaltests_dir,
                                new_version_without_v):
        conan_data_file = os.path.join(conan_approvaltests_dir, 'config.yml')
        conandata_yml_text = read_file(conan_data_file)

        conan_data = \
F'''  {new_version_without_v}:
    folder: all
'''
        conandata_yml_text += conan_data

        write_file(conan_data_file, conandata_yml_text)
Example #33
    def _detect_version_if_auto(self):
        """
        Detect version of MySQL if mysql.type is auto
        """
        print "Detect MySQL version for AUTO"

        check_file("/usr/local/directadmin/custombuild/build")
        check_file("/usr/local/directadmin/custombuild/options.conf")
        MYSQL_DA_VER = ""

        # MYSQL_DA_TYPE=`cat /usr/local/directadmin/custombuild/options.conf | grep mysql_inst= | cut -d= -f2`
        try:
            MYSQL_DA_VER = grep("/usr/local/directadmin/custombuild/options.conf", "mysql=")[0].split("=")[1].strip()
            MYSQL_DA_TYPE = grep("/usr/local/directadmin/custombuild/options.conf", "mysql_inst=")[0].split("=")[1].strip()
        except IndexError:
            MYSQL_DA_VER = ""
            MYSQL_DA_TYPE = ""
        if MYSQL_DA_TYPE == "no":
            if os.path.exists("/usr/share/lve/dbgovernor/da.tp.old"):
                MYSQL_DA_TYPE = read_file("/usr/share/lve/dbgovernor/da.tp.old")
            elif os.path.exists("/usr/bin/mysql"):
                result = exec_command("/usr/bin/mysql -V | grep -c 'MariaDB' -i || true", True)
                if result == "0":
                    MYSQL_DA_TYPE = "mysql"
                else:
                    MYSQL_DA_TYPE = "mariadb"

        print "I got %s and %s" % (MYSQL_DA_VER, MYSQL_DA_TYPE)

        mysql_version_map = {
            "5.0": "mysql50",
            "5.1": "mysql51",
            "5.5": "mysql55",
            "5.6": "mysql56",
            "5.7": "mysql57",
            "10.0.0": "mariadb100",
            "10.1.1": "mariadb101"
        }
        mariadb_version_map = {
            "10.1": "mariadb101",
            "10.0": "mariadb100",
            "5.6": "mariadb100",
            "5.5": "mariadb100",
            "10.0.0": "mariadb100",
            "10.1.1": "mariadb100"
        }

        if MYSQL_DA_TYPE == "mysql":
            MYSQL_DA_VER = mysql_version_map[MYSQL_DA_VER]
        elif MYSQL_DA_TYPE == "mariadb":
            MYSQL_DA_VER = mariadb_version_map[MYSQL_DA_VER]

        return MYSQL_DA_VER
Example #34
def correct_NUM():
    contents = read_file('data/old-NUM.txt', strip=True, dict_format=True)
    entries = read_file('data/cebposdict-nc.txt', strip=True, dict_format=True)

    for content in contents:
        entries[content] = ['NUM']

    write_file('data/cebposdict-nc.txt',
               contents=[''],
               add_newline=False,
               mode='w')
    for key, value in sorted(entries.iteritems()):
        new_entry = [key + ' ']
        value = list(set(value))
        new_entry.append(' '.join(value))
        new_entry.append('\n')
        write_file('data/cebposdict-nc.txt',
                   contents=new_entry,
                   add_newline=False,
                   mode='a')
        new_entry = []
Example #35
def main():
    hamlet = read_file(filepath)

    hamlet_cleaned = clean_text(hamlet)

    #hamlet_wordcount = wordcount(hamlet_cleaned)
    #hamlet_wordcount = wordcount_counter(hamlet_cleaned)
    #hamlet_wordcount = wordcount_dd(hamlet_cleaned)
    hamlet_wordcount = word_count_err_handling(hamlet_cleaned)

    #print(hamlet_wordcount.most_common(50))
    print(hamlet_wordcount)
Example #36
def avg(training_file, submission_file, output_file):
    data = utilities.read_file(training_file)

    train_data, cv_data = preprocess.get_train_cv_data_by_chunk(data)
    targets_train, targets_cv = preprocess.get_train_cv_targets(
        train_data, cv_data)

    (chunk_avg, hour_avg_by_chunk, weekday_avg_by_chunk, hour_avg,
     weekday_avg) = feature_extraction.get_avg_maps(train_data)

    x_train_all, x_cv_all = feature_extraction.get_x_by_avg(
        train_data, cv_data, chunk_avg, hour_avg_by_chunk,
        weekday_avg_by_chunk, hour_avg, weekday_avg)

    clfs = regression.linear_regression(x_train_all, x_cv_all, targets_train,
                                        targets_cv)
    clfs = regression.random_forest(x_train_all, x_cv_all, targets_train,
                                    targets_cv)

    print 'Filling submission file...'
    sub_data = utilities.read_file(submission_file, True)
    for i in range(1, len(sub_data)):
        chunk_id = sub_data[i][1]
        hour = sub_data[i][3]
        weekday = ''
        all_features = feature_extraction.get_features(chunk_id, weekday, hour,
                                                       chunk_avg,
                                                       hour_avg_by_chunk,
                                                       weekday_avg_by_chunk,
                                                       hour_avg, weekday_avg)

        for j in range(5, len(sub_data[i])):
            if sub_data[i][j] == '0':
                feature = []
                for f in all_features:
                    feature.append(f[j - 5])
                sub_data[i][j] = clfs[j - 5].predict([feature])[0]

    utilities.write_file(output_file, sub_data)
Example #37
def test_read_file():
    in_file = 'file_that_doesnt_exist'

    def file_reader(**kwargs):
        csv = kwargs['csv_reader']
        return None

    try:
        out = util.read_file(input_file=in_file, func=file_reader)
    except IOError:
        assert (True)
    else:
        assert (False)
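For comparison, the same expectation written with pytest.raises; util.read_file and its keyword arguments are taken from the test above, and the use of pytest is an assumption:

import pytest

def test_read_file_missing_file():
    # Reading a file that does not exist is expected to raise IOError.
    def file_reader(**kwargs):
        return None

    with pytest.raises(IOError):
        util.read_file(input_file='file_that_doesnt_exist', func=file_reader)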
Example #39
def analyze_candidates(solution_file, follow, followed):
    """ Analyzes the method get_candidates. """

    raw_solution = utilities.read_file(solution_file, False)
    dict_solution = {}
    for row in raw_solution:
        dict_solution[int(row[0])] = set(int(n) for n in row[1 : :])

    count_total = 0
    count_miss = 0
    for node in dict_solution:
        candidates = candidate.get_candidates(follow, followed, node)
        for n in dict_solution[node]:
            if n not in candidates:
                count_miss += 1
        count_total += len(dict_solution[node])

    print 'count_total = %d, count_miss = %d' %(
        count_total, count_miss)
Example #40
def time_series(training_file, submission_file, output_file):
    data = utilities.read_file(training_file, True)
    first_line = data[0]
    data = data[1 : :]
    data = preprocess.fill_NAs(data)

    (chunk_avg, hour_avg_by_chunk, weekday_avg_by_chunk,
     hour_avg, weekday_avg) = feature_extraction.get_avg_maps(data)

    clf_map = regression.linear_regression_2(data)

    print 'Filling submission file...'
    chunk_map = utilities.get_chunk_map(data, 1)
    sub_data = utilities.read_file(submission_file, True)

    positions = [1, 2, 3, 4, 5, 10, 17, 24, 48, 72]
    for i in range(1, len(sub_data)):
        chunk_id = sub_data[i][1]
        hour = sub_data[i][3]
        pos = positions[(i - 1) % 10]
        for j in range(5, len(sub_data[i])):
            target = j - 5
            if sub_data[i][j] == '0':
                if not chunk_id in chunk_map:
                    sub_data[i][j] = hour_avg[hour][target]
                else:
                    data_in_chunk = chunk_map[chunk_id]
                    start = len(data_in_chunk) - 24
                    t = len(data_in_chunk[0]) - 39 + target
                    features = []
                    prev_hour = 0
                    for k in range(start, len(data_in_chunk)):
                        features.append(float(data_in_chunk[k][t]))
                        if data_in_chunk[k][5] == hour:
                            prev_hour = float(data_in_chunk[k][t])

                    features.append(prev_hour)

                    # Binary hour features.
                    for h in range(24):
                        if h == int(hour):
                            features.append(1)
                        else:
                            features.append(0)

                    # Binary month features.
                    month = int(sub_data[i][4])
                    for m in range(1, 13):
                        if m == month:
                            features.append(1)
                        else:
                            features.append(0)

                    # Weather features.
                    tmp_length = len(data_in_chunk)
                    for k in range(6, 56):
                        features.append(float(data_in_chunk[tmp_length - 1][k]))
                    for k in range(6, 56):
                        features.append(float(data_in_chunk[tmp_length - 2][k]))

                    sub_data[i][j] = \
                        clf_map[(target, pos)].predict([features])[0]

    utilities.write_file(output_file, sub_data)