def dump_data(config):
    train_raw, dev_raw = get_raw_data(config)

    dev_data = mfcc_all(dev_raw, config)
    dev_path = os.path.join(config['mfcc']['data_path'], config['data']['dev_file'])
    with open(dev_path, 'wb') as f:
        pickle.dump(dev_data, f)

    train_data = mfcc_all(train_raw, config)
    train_path = os.path.join(config['mfcc']['data_path'], config['data']['train_file'])
    with open(train_path, 'wb') as f:
        pickle.dump(train_data, f)
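
# Counterpart sketch for reading the features back (an assumption -- the real
# loader is not shown; the paths simply mirror the config keys used in dump_data above).
import os
import pickle

def load_data(config):
    train_path = os.path.join(config['mfcc']['data_path'], config['data']['train_file'])
    dev_path = os.path.join(config['mfcc']['data_path'], config['data']['dev_file'])
    with open(train_path, 'rb') as f:
        train_data = pickle.load(f)
    with open(dev_path, 'rb') as f:
        dev_data = pickle.load(f)
    return train_data, dev_data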
def import_script(script_name=None, level_name=None):
    # get name of script file
    if not script_name: #if script_file not defined
        while True:
            print "Please enter the name of the script file."
            print "The current working directory is "
            print
            print os.getcwd()
            script_name = raw_input("Path to .6vscript file: ")
            if not script_name:
                print "You must specify a script to import."
                continue
            else:
                try:
                    with open(script_name): pass
                except IOError:
                    print 'File not found.'
                else:
                    break

        print

    # Checks whether level_name specified beforehand (for quiet execution)
    while not level_name: 
        print "Please enter the filename of the level"
        print "(do not include .vvvvvv or else bad things will happen)"
        level_name = utils.get_level_name()

        if not level_name:
            print "You must enter a level name"

    # backup level file
    print "Backing up level file..."
    backup_file = utils.level_backup(level_name)
    print "Backup saved to " + backup_file

    # get raw level data from file
    level_data = utils.get_raw_data(utils.get_vvvvvv_dir(), level_name)

    # get raw script data from file
    raw_script_data = utils.get_script_filedata(script_name)

    # convert script data to raw data
    script_data = utils.script_to_raw(raw_script_data)

    if not script_data:
        raise IOError

    # Adding script data to level data in memory
    utils.import_script_data(level_data, script_data)

    # going hot!
    success = utils.write_level_data(utils.get_vvvvvv_dir(), level_name, level_data)
    if success:
        print "File successfully written."

    else:
        print "An error occurred when writing the file."
Example #3
def run():
    confirmed_CSV_URL = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv'
    deaths_CSV_URL = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv'
    recovered_CSV_URL = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_recovered_global.csv'

    confirmed_total_data = get_raw_data(confirmed_CSV_URL)
    deaths_total_data = get_raw_data(deaths_CSV_URL)
    recovered_total_data = get_raw_data(recovered_CSV_URL)

    korea_confirmed, date = extract_data(confirmed_total_data, return_dates=True)
    korea_deaths = extract_data(deaths_total_data)
    korea_recovered = extract_data(recovered_total_data)
    korea_data = list(zip(date, korea_confirmed, korea_deaths,
                          korea_recovered))

    result = build_result(korea_data)

    save_dir = './data/koreaRegionalCumulativeData.js'
    crawler_name = 'crawlKoreaRegionalCumulativeData.py'
    var_name = 'koreaRegionalCumulativeData'

    write_data(result, save_dir, crawler_name, var_name)
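
# The run() above relies on a get_raw_data(url) helper that is not shown. A
# minimal sketch under the assumption that it downloads a Johns Hopkins
# time-series CSV and returns its parsed rows:
import csv
import requests

def get_raw_data(csv_url):
    response = requests.get(csv_url)
    response.raise_for_status()
    # Parse the CSV body into a list of rows (header row included).
    return list(csv.reader(response.text.splitlines()))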
def data_loader(args):
    train_data, train_labels = utils.get_raw_data(args.train_file)         # get the list of raw sentences
    val_data, val_labels = utils.get_raw_data(args.dev_file)

    args.catogories = ['EnterSports', 'Military', 'Economics', 'Technology', 'Government']
    args.cat_dict = dict(zip(args.catogories, range(len(args.catogories))))

    word_vocab, num_total_words = utils.build_dict(train_data)

    trainlabels_to_idx = [args.cat_dict[label] for label in train_labels]
    vallabels_to_idx = [args.cat_dict[label] for label in val_labels]

    train_data, train_labels = utils.encode(train_data, trainlabels_to_idx, word_vocab)
    val_data, val_labels = utils.encode(val_data, vallabels_to_idx, word_vocab)

    train_data = utils.pad_features(train_data, max_len=args.max_features)
    val_data = utils.pad_features(val_data, max_len=args.max_features)

    train_set = utils.batch(train_data.copy(), train_labels.copy(), args.batch_size)
    val_set = utils.batch(val_data.copy(), val_labels.copy(), args.batch_size)

    return train_set, val_set, num_total_words
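
# Minimal sketch of driving data_loader (hypothetical values; the argument
# names mirror the attributes accessed in the function above).
import argparse

args = argparse.Namespace(
    train_file="data/train.txt",
    dev_file="data/dev.txt",
    max_features=400,
    batch_size=64,
)
train_set, val_set, num_total_words = data_loader(args)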
Example #5
def run():
    confirmed_CSV_URL = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv'
    deaths_CSV_URL = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv'
    recovered_CSV_URL = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_recovered_global.csv'

    confirmed_total_data = get_raw_data(confirmed_CSV_URL)
    deaths_total_data = get_raw_data(deaths_CSV_URL)
    recovered_total_data = get_raw_data(recovered_CSV_URL)

    total_data, num_date = build_total_data(confirmed_total_data)

    countries = [
        'US', 'United Kingdom', 'Australia', 'Canada', 'China', 'Congo',
        'Denmark', 'France', 'Netherlands'
    ]

    final_data = concat_countries_data(total_data, num_date, countries)

    save_dir = './data/HopkinsCoronaWorldData.js'
    crawler_name = 'Hopkins_world_data_parser.py'
    var_name = 'hopkinsData'

    write_data(final_data, save_dir, crawler_name, var_name)
Example #6
def fetch_osm_pumps(path, outpath):

    create_folder(path)

    # specify query
    # (area["ISO3166-2"="DE-BE"][admin_level=4]; )->.searchArea;(node["man_made"="water_well"]["network"="Berliner Straßenbrunnen"](area.searchArea););
    query_string = "http://overpass-api.de/api/interpreter?data=%5Bout%3Ajson%5D%3B%28area%5B%22ISO3166%2D2%22%3D%22DE%2DBE%22%5D%5B%22admin%5Flevel%22%3D%224%22%5D%3B%29%2D%3E%2EsearchArea%3B%28node%5B%22man%5Fmade%22%3D%22water%5Fwell%22%5D%5B%22network%22%3D%22Berliner%20Straßenbrunnen%22%5D%28area%2EsearchArea%29%3B%29%3Bout%3B%3E%3Bout;"

    # get data and write to json
    raw_data = get_raw_data(query_string)
    json = raw_data.json()
    
    # transform and write to dataframe
    gdf = get_overpass_gdf(json)
    cleaned_gdf = transform_dataframe(gdf)
    write_df_to_json(cleaned_gdf, outpath)
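
# For reference, the percent-encoded URL above corresponds to the Overpass QL
# query quoted in the comment; a sketch of building it explicitly instead of
# hard-coding the encoded string (illustrative only):
from urllib.parse import quote

OVERPASS_QL = (
    '[out:json];'
    '(area["ISO3166-2"="DE-BE"]["admin_level"="4"];)->.searchArea;'
    '(node["man_made"="water_well"]["network"="Berliner Straßenbrunnen"](area.searchArea););'
    'out;>;out;'
)
query_string = "http://overpass-api.de/api/interpreter?data=" + quote(OVERPASS_QL)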
def main():
    model_type = "lda"
    exp_name = "PCA/scaled/all_samples"

    if exp_name.split("/")[1] == "scaled":
        scaled = True
    else:
        scaled = False

    all_sample_results = np.zeros((21, 50))

    for i, sample in enumerate(range(1, 22)):
        print("sample {}".format(sample))

        if exp_name == "raw/all_samples":
            x_train, y_train = get_raw_data(sample, scale=scaled)
        else:
            epochs = get_epochs(sample, scale=scaled)
            reduced_data = pca(80, epochs, plot=False)
            x_train = reduced_data.transpose(0, 2, 1).reshape(-1, reduced_data.shape[1])
            y_train = get_y_train(sample)

        results = linear_models(x_train, y_train, model_type=model_type)
        all_sample_results[i] = results

        sns.set()
        ax = sns.lineplot(data=results, dashes=False)
        ax.set(ylim=(0, 0.6), xlabel='Time', ylabel='Accuracy',
               title='Cross Val Accuracy {} for sample {}'.format(model_type, sample))
        plt.axvline(x=15, color='b', linestyle='--')
        ax.figure.savefig("Results/{}/{}/sample{}".format(model_type, exp_name, sample), dpi=300)
        # plt.show()
        plt.clf()

    all_results_df = pd.DataFrame(all_sample_results)
    all_results_df.to_csv("Results/{}/{}/all_sample_results.csv".format(model_type, exp_name))

    average_results = np.mean(all_sample_results, axis=0)
    sns.set()
    ax = sns.lineplot(data=average_results, dashes=False)
    ax.set(ylim=(0, 0.6), xlabel='Time', ylabel='Accuracy',
           title='Average Cross Val Accuracy {} across all samples'.format(model_type))
    plt.axvline(x=15, color='b', linestyle='--')
    ax.figure.savefig("Results/{}/{}/average_all_samples".format(model_type, exp_name), dpi=300)
    # plt.show()
    plt.clf()
Example #8
                    help='batch normalization')
parser.set_defaults(shuffle=False)
args = parser.parse_args()

print args

assert args.word_vector_size in [50, 100, 200, 300]

network_name = args.prefix + '%s.mh%d.n%d.bs%d%s%s%s.babi%s' % (
    args.network, args.memory_hops, args.dim, args.batch_size, ".na"
    if args.normalize_attention else "", ".bn" if args.batch_norm else "",
    (".d" + str(args.dropout)) if args.dropout > 0 else "",
    args.input_train.split("/")[-1])

# Go and get the data from the folders; see utils class.
train_raw, test_raw = utils.get_raw_data(args.input_train, args.input_test)

# Initialize word2vec with utils.load_glove
word2vec = utils.load_glove(args.word_vector_size)

args_dict = dict(args._get_kwargs())
args_dict['train_raw'] = train_raw
args_dict['test_raw'] = test_raw
args_dict['word2vec'] = word2vec

# init class
if args.network == 'dmn_batch':
    import dmn_batch
    dmn = dmn_batch.DMN_batch(**args_dict)

# The basic module is implemented for document similarity
Example #9
import os

import numpy as np
import pdf2image
from utils import draw_boxes
from pathlib import Path
from utils import get_raw_data, merge_blocks, create_order, get_blocks, remove_empty

if __name__ == '__main__':
    pdf_dir = "/home/mahad/abbyy_dummy_dataset/pdf"
    xml_dir = "/home/mahad/abbyy_dummy_dataset/xml"
    save_dir = "/tmp"
    pdf_files = os.listdir(pdf_dir)
    xml_files = os.listdir(xml_dir)
    for xml_file in xml_files:
        print(xml_file)
        xml_path = os.path.join(xml_dir, xml_file)
        pdf_path = os.path.join(pdf_dir, Path(xml_file).stem + ".pdf")
        xml_data = get_raw_data(xml_path)
        for page in xml_data:
            para_boxes = page["para_boxes"]
            para_texts = page["para_texts"]
            para_boxes, para_texts = remove_empty(para_boxes, para_texts)
            tables = page["tables"]
            table_boxes = [tt["bbox"] for tt in tables]
            table_texts = [tt["rows"] for tt in tables]
            img = pdf2image.convert_from_path(pdf_path, size=(page["width"], page["height"]),
                                              first_page=page["page_number"], last_page=page["page_number"])
            img = np.asarray(img[0])
            all_boxes = para_boxes + table_boxes
            all_texts = para_texts + table_texts
            column_blocks = get_blocks((page["height"], page["width"]), all_boxes)
            column_blocks_merged = merge_blocks(column_blocks, all_boxes)
            ordered_boxes = create_order(column_blocks_merged, all_boxes)
Example #10
assert args.word_vector_size in [50, 100, 200, 300]

network_name = args.prefix + "%s.mh%d.n%d.bs%d%s%s%s.babi%s" % (
    args.network,
    args.memory_hops,
    args.dim,
    args.batch_size,
    ".na" if args.normalize_attention else "",
    ".bn" if args.batch_norm else "",
    (".d" + str(args.dropout)) if args.dropout > 0 else "",
    args.input_train.split("/")[-1],
)

# Go and get the data from the folders; see utils class.
train_raw, test_raw = utils.get_raw_data(args.input_train, args.input_test)

# Initialize word2vec with utils.load_glove
word2vec = utils.load_glove(args.word_vector_size)

args_dict = dict(args._get_kwargs())
args_dict["train_raw"] = train_raw
args_dict["test_raw"] = test_raw
args_dict["word2vec"] = word2vec


# init class
if args.network == "dmn_batch":
    import dmn_batch

    dmn = dmn_batch.DMN_batch(**args_dict)
Example #11
from lxml import etree
from utils import get_raw_data


def retrieve_style_text(xml_file, style):
    xml_tree = etree.parse(xml_file)
    text = []
    locs = []
    for elem in xml_tree.iter():
        if elem.tag.count('charParams') > 0 and elem.attrib["style"] == style:
            text.append(elem.text)
            locs.append([elem.attrib["l"], elem.attrib["t"], elem.attrib["r"], elem.attrib["b"]])
    return text, locs


if __name__ == '__main__':
    xml_file = "/home/mahad/abbyy_dummy_dataset/xml/Original Doc_Alpha FDI Holdings Pte. Ltd. (1).xml"
    results = get_raw_data(xml_file)
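    # Illustrative follow-up: pull out the text in a given character style.
    # The "bold" style value is an assumption; use whatever style values appear
    # in the charParams elements of your ABBYY XML.
    bold_text, bold_locs = retrieve_style_text(xml_file, "bold")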
# -*- coding: utf-8 -*-
import regex as re
import json
import os
from collections import OrderedDict

from utils import get_raw_data

os.chdir(os.path.dirname(os.path.abspath(__file__)))
get_raw_data()

# Languages with insufficient translation data are excluded
avoid_languages = ['cu', 'kkj', 'nds', 'prg', 'tk', 'vai', 'vai-Latn', 'vai-Vaii', 'vo']


def _get_language_locale_dict():
    cldr_dates_full_dir = "../raw_data/cldr_dates_full/main/"
    available_locale_names = os.listdir(cldr_dates_full_dir)
    available_language_names = [shortname for shortname in available_locale_names
                                if not re.search(r'-[A-Z0-9]+$', shortname)]
    available_language_names.remove('root')
    language_locale_dict = {}
    for language_name in available_language_names:
        language_locale_dict[language_name] = []
        for locale_name in available_locale_names:
            if re.match(language_name + '-[A-Z0-9]+$', locale_name):
                language_locale_dict[language_name].append(locale_name)

    for language in avoid_languages:
        del language_locale_dict[language]
    return language_locale_dict
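
# Illustrative use of the helper above: map each base language to its regional
# locale variants found in the CLDR data (output depends on the local raw_data).
language_locale_dict = _get_language_locale_dict()
print(language_locale_dict.get('de'))   # e.g. ['de-AT', 'de-BE', 'de-CH', ...]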
def extract(level_name=None, save_file=None):
    # Initializing variables
    filedata = ""
    script_data = None
    vvvvvv_dir = None

    # Get current opsys
    vvvvvv_dir = utils.get_vvvvvv_dir()

    # Checks whether level_name specified beforehand (for quiet execution)
    if not level_name:
        # request filename from user
        while True:
            level_name = None
            level_name = utils.get_level_name()
            if not level_name:
                print "You must enter a level name"
                continue
            
            # get level data
            raw_data = utils.get_raw_data(vvvvvv_dir, level_name)

            if not raw_data:
                print "Error: level does not exist"
                continue
            else:
                break

    else:
        raw_data = utils.get_raw_data(vvvvvv_dir, level_name)

    # get script data
    script_data = utils.get_script_data(raw_data)

    if not script_data:
        print "No script found"
        quit()

    final_data = utils.cleanup_data(script_data)

    print "Done!"

    # checks if save_file specified beforehand (for quiet execution)
    if not save_file:
        cwd = os.getcwd()
        print
        print "What file do you wish me to save the data to?"
        print "Current working directory is: "
        print
        print cwd
        print
        print "You may enter a filename to save in current directory,"
        print "enter a relative path, or a full path."
        print
        print "Else, press return to accept the default, which is: "
        print
        print level_name + ".6vscript"
        print
        save_file = raw_input("Save file: ")
        if not save_file:
            save_file = level_name + ".6vscript"

    else:
        pass

    with open(save_file, 'w') as outfile:
        for line in final_data:
            outfile.write(line + '\n')

        print save_file + " written"
Example #14
def test_get_raw_data(query_fixture):
    response = get_raw_data(query_fixture)
    assert isinstance(response, Response)
    assert response.ok
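
# The test above assumes a pytest fixture that supplies a query URL and that
# get_raw_data returns a requests.Response. A hypothetical sketch of the
# missing pieces (the real fixture and imports are not shown):
import pytest
from requests import Response
from utils import get_raw_data

@pytest.fixture
def query_fixture():
    # Any small, valid Overpass query URL would do for a live round-trip test.
    return ("http://overpass-api.de/api/interpreter?"
            "data=%5Bout%3Ajson%5D%3Bnode%281%29%3Bout%3B")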
Example #15
def getData(building, zone, date):
    """Whatever data we get should be stored.
	date: in PST"""

    root, dirs, files = os.walk("CacheThanos/").next()
    Flag = False
    for index, thefile in enumerate(files, start=1):
        if str(building) + str(zone) + str(date) + ".dat" == thefile:
            Flag = True

    if Flag == False:

        # get config
        cfg = utils.get_config(building)
        zone_cfg = utils.get_zone_config(building, zone)

        events = []
        zone_log = utils.get_zone_formalog(building, zone)
        if zone_log:
            for line in zone_log:
                dateLog = utils.get_mdal_string_to_datetime(
                    line.split(" : ")[0])
                dateLog = dateLog.astimezone(pytz.timezone("US/Pacific"))
                if dateLog.date() == date.date():
                    events.append((int(
                        (dateLog.replace(tzinfo=None) -
                         date.replace(tzinfo=None)).total_seconds() / 60),
                                   line.split(" : ")[1]))

        interval = cfg["Interval_Length"]

        # client = utils.choose_client(cfg)
        client = get_client()

        start = date.replace(hour=0, minute=0, second=0)
        end = date.replace(day=date.day + 1, hour=0, minute=0, second=0)

        # Generate utc times. Use UTC for any archiver getting methods.
        pst_pytz = pytz.timezone("US/Pacific")

        start_pst = pst_pytz.localize(start)
        start_utc = start_pst.astimezone(pytz.timezone("UTC"))

        end_pst = pst_pytz.localize(end)
        end_utc = end_pst.astimezone(pytz.timezone("UTC"))

        datamanager = DataManager(cfg, zone_cfg, client, zone, now=start_utc)

        # get setpoints
        ground_truth_setpoints_df = datamanager.thermostat_setpoints(
            start_utc, end_utc)[zone]  # from archiver
        ground_truth_setpoints_df.index = ground_truth_setpoints_df.index.tz_convert(
            pst_pytz)

        config_setpoints_df = datamanager.better_comfortband(start)
        safety_setpoints_df = datamanager.better_safety(start)

        config_setpoints = config_setpoints_df[["t_low", "t_high"]].values
        safety_setpoints = safety_setpoints_df[["t_low", "t_high"]].values

        # Get tstat and weather data
        thermal_data_manager = ThermalDataManager(cfg, client)

        inside_data, outside_data = utils.get_raw_data(building=building,
                                                       client=client,
                                                       cfg=cfg,
                                                       start=start_utc,
                                                       end=end_utc,
                                                       force_reload=True)
        zone_inside_data = inside_data[zone]
        zone_inside_data.index = zone_inside_data.index.tz_convert(pst_pytz)
        outside_data = thermal_data_manager._preprocess_outside_data(
            outside_data.values())
        outside_data.index = outside_data.index.tz_convert(pst_pytz)
        outside_data = outside_data.resample("1T").interpolate()

        Tin = zone_inside_data["t_in"].values
        if np.isnan(Tin).any():
            print "Warning: Tin contains NaN. Estimates are based on interpolations"
            nans, x = nan_helper(Tin)
            Tin[nans] = np.interp(x(nans), x(~nans), Tin[~nans])

        # TODO shitty hack
        # taking the raw data and putting it into a data frame full of nan. Then, interpolating the data to get
        # data for the whole day.
        Tout = pd.DataFrame(columns=["t_out"],
                            index=pd.date_range(start=start,
                                                end=end,
                                                freq="1T"))
        Tout.index = Tout.index.tz_localize(pst_pytz)
        Tout["t_out"][outside_data.index[0]:outside_data.
                      index[-1]] = outside_data["t_out"]
        Tout = Tout.ffill()["t_out"].values[:1440]

        Policy = zone_inside_data["action"].values

        # Prepare discomfort
        discomfortManager = Discomfort(setpoints=config_setpoints)

        # get occupancies
        occupancy_config = datamanager.better_occupancy_config(start)
        try:
            occupancy_ground = datamanager.occupancy_archiver(start=start,
                                                              end=end)
        except:
            if zone_cfg["Advise"]["Occupancy_Sensors"] == True:
                print("Warning, could not get ground truth occupancy.")
            occupancy_ground = None

        if occupancy_ground is None:
            occupancy_use = occupancy_config
        else:
            occupancy_use = occupancy_ground

        occupancy_use = occupancy_use["occ"].values

        discomfort = []
        for i in range(len(Tin)):
            # for the ith minute
            print len(Tin), len(occupancy_use)
            assert len(Tin) <= len(occupancy_use)
            tin = Tin[i]
            occ = occupancy_use[i]
            discomfort.append(
                discomfortManager.disc(t_in=tin,
                                       occ=occ,
                                       node_time=i,
                                       interval=1))

        # get consumption and cost and prices
        prices = datamanager.better_prices(start).values
        heating_consumption = zone_cfg["Advise"]["Heating_Consumption"]
        cooling_consumption = zone_cfg["Advise"]["Cooling_Consumption"]

        energy_manager = EnergyConsumption(prices,
                                           interval,
                                           now=None,
                                           heat=heating_consumption,
                                           cool=cooling_consumption)
        cost = []
        for i in range(len(Policy)):
            # see it as the ith minute. That's why we need the assert
            assert len(Policy) <= len(prices)
            action = Policy[i]
            cost.append(energy_manager.calc_cost(action=action, time=i))
        cost = np.array(cost)

        # Cache the data and check if already downloaded!
        OPs = occupancy_use[:1440]

        TinsUPComfortBand = config_setpoints_df["t_high"][:1440]

        TinsDOWNComfortBand = config_setpoints_df["t_low"][:1440]

        TinsUPSafety = safety_setpoints_df["t_high"][:1440]

        TinsDOWNSafety = safety_setpoints_df["t_low"][:1440]

        TinsUPsp = ground_truth_setpoints_df["t_high"][:1440]

        TinsDOWNsp = ground_truth_setpoints_df["t_low"][:1440]

        Costs = cost[:1440]

        Prices = prices[:1440]

        Discomforts = discomfort[:1440]

        temp = OPs, Tin, Tout, Policy, TinsUPComfortBand, TinsDOWNComfortBand, TinsUPSafety, TinsDOWNSafety, TinsUPsp, TinsDOWNsp, Costs, Prices, Discomforts, events, building, zone, date
        pickle.dump(
            temp,
            open(
                "CacheThanos/" + str(building) + str(zone) + str(Date) +
                ".dat", "wb"))
        return temp

    else:
        return pickle.load(
            open(
                "CacheThanos/" + str(building) + str(zone) + str(date) +
                ".dat", "rb"))
Example #16
def get_data(filepath):
    raw_data = get_raw_data(filepath)
    preprocessed_data = preprocess_data(raw_data)
    row_length = len(preprocessed_data[0])
    data = df(preprocessed_data)
    return data, row_length
Example #17
    u'\N{REVERSED PRIME}',  # u'\u2035'
    u'\N{MODIFIER LETTER PRIME}',  # u'\u02b9'
    u'\N{FULLWIDTH APOSTROPHE}',  # u'\uff07'
]

DATE_ORDER_PATTERN = re.compile(
    u'([DMY])+\u200f*[-/. \t]*([DMY])+\u200f*[-/. \t]*([DMY])+')
RELATIVE_PATTERN = re.compile(r'(?<![\+\-]\s*)\{0\}')
DEFAULT_MONTH_PATTERN = re.compile(r'^M?\d+$', re.U)
RE_SANITIZE_APOSTROPHE = re.compile(u'|'.join(APOSTROPHE_LOOK_ALIKE_CHARS))
AM_PATTERN = re.compile(r'^\s*[Aa]\s*\.?\s*[Mm]\s*\.?\s*$')
PM_PATTERN = re.compile(r'^\s*[Pp]\s*\.?\s*[Mm]\s*\.?\s*$')
PARENTHESIS_PATTERN = re.compile(r'[\(\)]')

os.chdir(os.path.dirname(os.path.abspath(__file__)))
get_raw_data()

cldr_dates_full_dir = "../raw_data/cldr_dates_full/main/"


def _filter_relative_string(relative_string):
    return (isinstance(relative_string, six.string_types)
            and RELATIVE_PATTERN.search(relative_string)
            and not PARENTHESIS_PATTERN.search(relative_string))


def _filter_month_name(month_name):
    return not DEFAULT_MONTH_PATTERN.match(month_name)


def _retrieve_locale_data(locale):
Example #18
    def get_data(self, filepath):
        raw_data = get_raw_data(filepath)
        preprocessed_data = preprocess_data(raw_data)
        self.row_length = len(preprocessed_data[0])
        self.data = df(preprocessed_data)