Example #1
    def __init__(self,
                 nmembers,
                 npolicies,
                 ribdump,
                 rnd_policies=True,
                 member_cap=None,
                 path_templates=None):
        self.nmembers = nmembers
        self.npolicies = npolicies
        self.ribdump = ribdump
        self.rnd_policies = rnd_policies
        if path_templates:
            self.path_templates = path_templates
        else:
            self.path_templates = "templates/"

        if member_cap:
            self.member_cap = member_cap
        else:
            self.member_cap = self.nmembers

        self.update_template = util.load_json_file(self.path_templates +
                                                   "update.json")
        self.sdx_template = util.load_json_file(self.path_templates +
                                                "sdx.json")
        self.route_set = self.parse_routes()
        self.members = self.gen_ixp_members()
        self.gen_members_policies()
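Note: every example on this page goes through a project-local load_json_file helper (variously imported as load_json_file, util.load_json_file, or UTIL.load_json_file) rather than calling the json module directly. The real implementations differ per project; a minimal sketch of the shape most of these call sites assume (a single path, or a directory plus a file name as in the constructors below) might look like the following. The variant in Example #24 that also takes a logger is a separate helper.

import json
import os


def load_json_file(path, filename=None):
    # Hypothetical helper: accept either a full path or a directory plus a
    # file name, read the file, and return the parsed JSON object.
    full_path = os.path.join(path, filename) if filename is not None else path
    with open(full_path, encoding='utf-8') as f:
        return json.load(f)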
Example #2
    def __init__(self, dir_path):
        '''
        Constructor
        Inputs:
            dir_path: (string) path to the directory that contains the
              file

        Initializes five public attributes:
        name: name of dataset
        predictor_vars: list of all predictor variables
        dependent_var: dependent variable
        labels: label of predictor variables and dependent variable
        data: a list with two elements, the first being the training data 
        and the second being the testing data
        '''

        # Read the CSV data file and the JSON parameters file
        data = util.load_numpy_array(dir_path, "data.csv")
        parameters = util.load_json_file(dir_path, "parameters.json")
        # Initializing attributes
        self.name = parameters["name"]
        self.predictor_vars = parameters["predictor_vars"]
        self.dependent_var = parameters["dependent_var"]
        self.labels = data[0]
        self.data = train_test_split(
            data[1],
            train_size=parameters["training_fraction"],
            test_size=None,
            random_state=parameters["seed"],
        )
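The constructor above stores the return value of sklearn's train_test_split directly. Called with a single array, train_test_split returns a two-element list (training rows, then testing rows), so self.data ends up as [train, test]. A quick sanity-check sketch:

import numpy as np
from sklearn.model_selection import train_test_split

arr = np.arange(10).reshape(5, 2)
train, test = train_test_split(arr, train_size=0.6, test_size=None, random_state=0)
print(train.shape, test.shape)  # (3, 2) (2, 2)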
Example #3
def setup_avatar_db(conn, adb):
    print("Creating the user avatar database")
    adb_url = '/' + adb
    conn.request("PUT", adb_url, headers=ph)
    resp = conn.getresponse()
    util.decode_response(resp)
    if resp.getcode() == 201:
        print("User Avatar database created.")
    elif resp.getcode() == 409 or resp.getcode() == 412:
        print("Avatar database already exists.")
    else:
        print("Error creating avatar database.")
    # Now save the auth document
    auth_url = adb_url + '/_design/_auth'
    conn.request("GET", auth_url, headers=gh)
    resp = conn.getresponse()
    addoc = util.load_json_file(
        os.path.join(wf_dir, 'scripts/ddoc/avatar_auth.json'))
    addoc_old = util.decode_response(resp)
    if resp.getcode() == 200:
        print("Avatar auth doc already exists.  Updating.")
        addoc['_rev'] = addoc_old['_rev']
    req_body = json.dumps(addoc)
    conn.request("PUT", auth_url, body=req_body, headers=ph)
    resp = conn.getresponse()
    util.decode_response(resp)
    if resp.getcode() == 200 or resp.getcode() == 201:
        print("Avatar auth doc saved successfully.")
    else:
        print("Avatar auth doc save failed.")
Example #4
def setup_user_db(conn):
    # Now we'll set up the CouchDB user database
    print("\nSetting user database public fields in CouchDB")
    # First, set the user public fields in the CouchDB config
    url = '/_config/couch_httpd_auth/public_fields'
    field = "\"userPublic\""
    conn.request("PUT", url, body=field, headers=ph)
    resp = conn.getresponse()
    util.decode_response(resp)
    if resp.getcode() == 200 or resp.getcode() == 201:
        print("User config updated successfully")
    else:
        print("User config update failed!")
    # Now, set up some views in the user database
    url = '/_users/_design/user_queries'
    # Get the user design doc, if it exists
    conn.request("GET", url, headers=gh)
    resp = conn.getresponse()
    old_ddoc = util.decode_response(resp)
    user_ddoc = util.load_json_file(
        os.path.join(wf_dir, "scripts/ddoc/user_ddoc.json"))
    if resp.getcode() != 404:
        user_ddoc['_rev'] = old_ddoc['_rev']
    req_body = json.dumps(user_ddoc)
    conn.request("PUT", url, body=req_body, headers=ph)
    resp = conn.getresponse()
    util.decode_response(resp)
    if resp.getcode() == 200 or resp.getcode() == 201:
        print("User design doc saved successfully.")
    else:
        print("User design doc save failed.")
Example #5
def setup_avatar_db(conn, adb):
    print("Creating the user avatar database")
    adb_url = "/" + adb
    conn.request("PUT", adb_url, headers=ph)
    resp = conn.getresponse()
    util.decode_response(resp)
    if resp.getcode() == 201:
        print("User Avatar database created.")
    elif resp.getcode() == 409 or resp.getcode() == 412:
        print("Avatar database already exists.")
    else:
        print("Error creating avatar database.")
    # Now save the auth document
    auth_url = adb_url + "/_design/_auth"
    conn.request("GET", auth_url, headers=gh)
    resp = conn.getresponse()
    addoc = util.load_json_file(os.path.join(wf_dir, "scripts/ddoc/avatar_auth.json"))
    addoc_old = util.decode_response(resp)
    if resp.getcode() == 200:
        print("Avatar auth doc already exists.  Updating.")
        addoc["_rev"] = addoc_old["_rev"]
    req_body = json.dumps(addoc)
    conn.request("PUT", auth_url, body=req_body, headers=ph)
    resp = conn.getresponse()
    util.decode_response(resp)
    if resp.getcode() == 200 or resp.getcode() == 201:
        print("Avatar auth doc saved successfully.")
    else:
        print("Avatar auth doc save failed.")
Example #6
def setup_user_db(conn):
    # Now we'll set up the CouchDB user database
    print("\nSetting user database public fields in CouchDB")
    # First, set the user public fields in the CouchDB config
    url = "/_config/couch_httpd_auth/public_fields"
    field = '"userPublic"'
    conn.request("PUT", url, body=field, headers=ph)
    resp = conn.getresponse()
    util.decode_response(resp)
    if resp.getcode() == 200 or resp.getcode() == 201:
        print("User config updated successfully")
    else:
        print("User config update failed!")
    # Now, set up some views in the user database
    url = "/_users/_design/user_queries"
    # Get the user design doc, if it exists
    conn.request("GET", url, headers=gh)
    resp = conn.getresponse()
    old_ddoc = util.decode_response(resp)
    user_ddoc = util.load_json_file(os.path.join(wf_dir, "scripts/ddoc/user_ddoc.json"))
    if resp.getcode() != 404:
        user_ddoc["_rev"] = old_ddoc["_rev"]
    req_body = json.dumps(user_ddoc)
    conn.request("PUT", url, body=req_body, headers=ph)
    resp = conn.getresponse()
    util.decode_response(resp)
    if resp.getcode() == 200 or resp.getcode() == 201:
        print("User design doc saved successfully.")
    else:
        print("User design doc save failed.")
Example #7
def parse_json_file(_filename, _already):
    sellers_set = set()

    data = util.load_json_file(_filename)

    for good in data:
        seller = good['seller']

        print(seller)
        print(_filename)
        print(good)

        s = seller.split('/')
        if len(s) == 0:
            continue
        s = s[-1][2:]
        if not s:
            continue
        seller_id = int(s)
        print(seller_id)

        unique = False
        if seller_id not in sellers_set:
            unique = True

        if seller_id not in _already:
            sellers_set.add(seller_id)
        else:
            print(seller_id, "is already in file")
        # use vk api to load user information
        # only if this id is unique

    return sellers_set
Example #8
def test_pull_request_created(app):
    headers = {
        'X-Request-UUID': 'afe23a8c-dde6-4cde-8eaa-3e50077849f4',
        'X-Event-Key': 'pullrequest:created',
        'X-Event-Time': 'Wed, 10 Jul 2019 20:23:28 GMT',
        'X-Attempt-Number': '1',
        'X-Hook-UUID': 'fee8a257-2939-4b3a-aa64-2e07be1a8fb8',
        'User-Agent': 'Bitbucket-Webhooks/2.0',
        'Content-Type': 'application/json'
    }

    event = load_json_file('./tests/fixtures/pullrequest-created.json')
    expected_response = load_json_file(
        './tests/responses/pullrequest-created.json')
    res = handle_bitbucket_event(event, headers)
    response = json.loads(res)
    assert expected_response == response
Example #9
def test_pull_request_fulfilled(app):
    headers = {
        'X-Request-UUID': 'a607b1c4-be59-4a27-83e5-208de2fa7e81',
        'X-Event-Key': 'pullrequest:fulfilled',
        'X-Event-Time': 'Wed, 10 Jul 2019 21:41:04 GMT',
        'X-Attempt-Number': '1',
        'X-Hook-UUID': 'fee8a257-2939-4b3a-aa64-2e07be1a8fb8',
        'User-Agent': 'Bitbucket-Webhooks/2.0',
        'Content-Type': 'application/json'
    }

    event = load_json_file('./tests/fixtures/pullrequest-fulfilled.json')
    expected_response = load_json_file(
        './tests/responses/pullrequest-fulfilled.json')
    res = handle_bitbucket_event(event, headers)
    response = json.loads(res)
    assert expected_response == response
Example #10
def test_pull_request_rejected(app):
    headers = {
        'X-Request-UUID': 'a391cc1e-c057-4168-b5fd-2e52b911d5fd',
        'X-Event-Key': 'pullrequest:rejected',
        'X-Event-Time': 'Wed, 10 Jul 2019 20:23:44 GMT',
        'X-Attempt-Number': '1',
        'X-Hook-UUID': 'fee8a257-2939-4b3a-aa64-2e07be1a8fb8',
        'User-Agent': 'Bitbucket-Webhooks/2.0',
        'Content-Type': 'application/json'
    }

    event = load_json_file('./tests/fixtures/pullrequest-rejected.json')
    expected_response = load_json_file(
        './tests/responses/pullrequest-rejected.json')
    res = handle_bitbucket_event(event, headers)
    response = json.loads(res)
    assert expected_response == response
Example #11
def test_commit_status_updated_failed(app):
    headers = {
        'X-Request-UUID': '1450daa7-5036-4b25-b24d-13fe76363b25',
        'X-Event-Key': 'repo:commit_status_updated',
        'X-Event-Time': 'Thu, 11 Jul 2019 14:36:20 GMT',
        'X-Attempt-Number': '1',
        'X-Hook-UUID': 'fee8a257-2939-4b3a-aa64-2e07be1a8fb8',
        'User-Agent': 'Bitbucket-Webhooks/2.0',
        'Content-Type': 'application/json'
    }

    event = load_json_file(
        './tests/fixtures/commit-status-updated-failed.json')
    expected_response = load_json_file(
        './tests/responses/commit-status-updated-failed.json')
    res = handle_bitbucket_event(event, headers)
    response = json.loads(res)
    assert expected_response == response
Example #12
def test_commit_status_updated_successful(app):
    headers = {
        'X-Request-UUID': '01e7f365-6430-4a79-bd5a-976acc8e228e',
        'X-Event-Key': 'repo:commit_status_updated',
        'X-Event-Time': 'Thu, 11 Jul 2019 15:01:11 GMT',
        'X-Attempt-Number': '1',
        'X-Hook-UUID': 'fee8a257-2939-4b3a-aa64-2e07be1a8fb8',
        'User-Agent': 'Bitbucket-Webhooks/2.0',
        'Content-Type': 'application/json'
    }

    event = load_json_file(
        './tests/fixtures/commit-status-updated-successful.json')
    expected_response = load_json_file(
        './tests/responses/commit-status-updated-successful.json')
    res = handle_bitbucket_event(event, headers)
    response = json.loads(res)
    assert expected_response == response
Example #13
 def __init__(self, dir_path):
     '''
     Constructor
     Inputs:
         dir_path: (string) path to the directory that contains the
           file
     '''
     labels, self.csv = util.load_numpy_array(dir_path, "data.csv")
     json_full = util.load_json_file(dir_path, "parameters.json")
     self.name = json_full['name']
     self.predictor_vars = json_full['predictor_vars']
     self.dependent_var = json_full['dependent_var']
     self.training_fraction = json_full['training_fraction']
     self.seed = json_full['seed']
Example #14
    def __init__(self, dir_path):
        '''
        Constructor
        Inputs:
            dir_path: (string) path to the directory that contains the
              file
        '''

        # Load the raw data array and the dataset parameters
        self.data = util.load_numpy_array(dir_path, 'data.csv')
        self.parameters = util.load_json_file(dir_path, 'parameters.json')

        self.training_data, self.testing_data = train_test_split(
            self.data[1],
            test_size=(1 - self.parameters['training_fraction']),
            random_state=self.parameters['seed'])
Example #15
 def __init__(self, dir_path):
     '''
     Constructor
     Inputs:
         dir_path: (string) path to the directory that contains the
           file
     '''
     self.dir_path = dir_path
     params_dict = util.load_json_file(self.dir_path, 'parameters.json')
     self.label, data = util.load_numpy_array(self.dir_path, 'data.csv')
     self.pred_vars = params_dict['predictor_vars']
     self.dep_var = params_dict['dependent_var']
     self.X_train, self.X_test, self.y_train, self.y_test = \
         train_test_split(data[:,self.pred_vars], data[:,self.dep_var],
                         train_size = params_dict['training_fraction'],
                         random_state = params_dict['seed'])
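Examples #2 and #13 through #15 also rely on a util.load_numpy_array(dir_path, filename) helper that returns the column labels separately from the numeric data. A minimal sketch under that assumption (not the actual util module these assignments ship with):

import csv
import os

import numpy as np


def load_numpy_array(dir_path, filename):
    # Hypothetical helper matching the call sites above: the first return
    # value is the header row (column labels), the second the numeric data.
    with open(os.path.join(dir_path, filename), newline='') as f:
        rows = list(csv.reader(f))
    return rows[0], np.array(rows[1:], dtype=float)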
Example #16
def prepare(study, portfolio, remote):
    studyParams = util.load_json_file("study/%s.json" % study)
    
    search = build_search(study, portfolio, studyParams)
    logging.info("Caching %s-%s/search" % (study, portfolio))
    cache.put("batch/%s-%s/search" % (study, portfolio), search, remote)

    batch_ = search['batch_']
    target_ = search['target_']
    value__ = search['value__']
    for batch, value_ in zip(batch_, value__):
        params = copy.deepcopy(studyParams)
        del params['shift']
        params['portfolioKey'] = "portfolio/%s" % portfolio
        apply_search(params, target_, value_)
        params['episodes'].update(epi.build_episodes(params['episodes']))
        logging.info("Caching %s" % batch)
        cache.put("batch/%s/params" % batch, params, remote)
Example #17
def update_url_id(message_file_name, directory):
    message_id = message_file_name.split('.')[0]
    print(message_id)
    r = tg.call_method('getMessageLink',
                       params={
                           'chat_id': chat_id,
                           'message_id': message_id
                       })
    r.wait()
    if not r.update:
        return
    if 'url' not in r.update:
        return
    url_id = r.update['url'].split('/')[-1]
    print('https://t.me/cyclingmarket/{}'.format(url_id))
    full_path = os.path.join(directory, message_file_name)
    data = util.load_json_file(full_path)
    data['url_id'] = url_id
    util.save_json_file(full_path, data)
Example #18
def http_put(ctl, args):
    if args['--payload']:
        payload = args['<payload>']
    elif args['--file']:
        payload = util.load_json_file(args['<file>'])
    status = OperStatus()
    headers = {'content-type': 'application/yang.data+json',
               'accept': 'text/json, text/html, application/xml, */*'}
    template_url = "http://{}:{}/restconf/{}"
    url = template_url.format(ctl.ipAddr, ctl.portNum, args['<resource>'])
    resp = ctl.http_put_request(url, json.dumps(payload), headers)
    if(resp is None):
        status.set_status(STATUS.CONN_ERROR)
    elif(resp.content is None):
        status.set_status(STATUS.CTRL_INTERNAL_ERROR)
    elif(resp.status_code == 200 or resp.status_code == 204):
        status.set_status(STATUS.OK)
    else:
        status.set_status(STATUS.HTTP_ERROR, resp)
    return Result(status, resp)
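http_put above expects a docopt-style argument dictionary plus a controller object exposing ipAddr, portNum, and http_put_request. A hypothetical invocation (the resource path and file name below are made up for illustration):

# `ctl` is assumed to be a controller wrapper providing ipAddr, portNum,
# and an http_put_request(url, body, headers) method.
args = {
    '--payload': False,
    '--file': True,
    '<payload>': None,
    '<file>': 'flow_config.json',                          # made-up file name
    '<resource>': 'config/opendaylight-inventory:nodes',   # made-up resource
}
result = http_put(ctl, args)

Note that if neither --payload nor --file is set, payload is never assigned and json.dumps raises a NameError, so callers are expected to pass exactly one of the two flags.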
Example #19
            print(seller_id, "is already in file")
        # use vk api to load user information
        # only if this id is unique

    return sellers_set


if __name__ == "__main__":
    if len(sys.argv) < 2:
        print("give me directory with .json files with messages from channel")
        exit(-1)

    already_got_sellers_set = set()
    if len(sys.argv) == 3:
        sellers_json_input = sys.argv[2]
        sellers = util.load_json_file(sellers_json_input)
        if sellers:
            for s in sellers:
                already_got_sellers_set.add(s['id'])

    dir_name = sys.argv[1]
    json_files = glob.glob(os.path.join(dir_name, "messages*.json"))
    print(json_files)

    sellers_id_set = set()

    # f = dir_name + "messages.json"
    for f in json_files:
        sellers_id_set.update(parse_json_file(f, already_got_sellers_set))

    # print(sellers_id_set)
Example #20
SPB = [
    'с-пб', 'питер', 'санкт-петербург(доставказавашсчёт)',
    'спбплощадьвосстания', 'санк-петербург',
    'санкт-петербургдоставкакомпаниейсдэкзавашсчетпороссиииснг',
    'санктпетербург', 'спб', 'spb', 'петербург', 'санкт-петербург',
    'веледетвспбсдэком(пересылвмскзавашсчет)ценазастоковуюкомплектациюгайз',
    'санкт-петербург+почта'
]

MOSCOW = ['москва-красногорск', 'москва', 'мск']

if __name__ == "__main__":
    processed = os.listdir(PROCESSED_DIR)
    cities = set()
    for post_id in processed:
        filename = os.path.join(PROCESSED_DIR, post_id, 'data.json')
        json = util.load_json_file(filename)
        if isinstance(json['city'], dict):
            continue
        city_clear = clear_city_string(json['city'])
        cities.add(city_clear)

        if city_clear in SPB:
            json['city'] = {'id': 2, 'text': json['city']}
        if city_clear in MOSCOW:
            json['city'] = {'id': 1, 'text': json['city']}
        if city_clear == 'ростов-на-дону':
            json['city'] = {'id': 119, 'text': json['city']}
        if city_clear == 'великийновгород':
            json['city'] = {'id': 35, 'text': json['city']}
        if city_clear == 'вологда':
            json['city'] = {'id': 41, 'text': json['city']}
Example #21
def setup_main_db(conn, main_db):

    write_role = main_db + ":write"
    # This is the validation function that controls writes to the main database
    validation_func = (
        """
    function(newDoc, oldDoc, userCtx){
        if((userCtx.roles.indexOf("%s") === -1) &&
            (userCtx.roles.indexOf("admin") === -1) &&
            (userCtx.roles.indexOf("master") === -1) &&
            (userCtx.roles.indexOf("_admin") === -1)){
                throw({forbidden: "Not authorized"}); }
    }
    """
        % write_role
    )

    auth_doc = dict()
    auth_doc["_id"] = "_design/_auth"
    auth_doc["validate_doc_update"] = validation_func

    # Create the main database
    print("Creating the main Wikifeat database")
    main_db_url = "/" + main_db
    conn.request("PUT", main_db_url, headers=ph)
    resp = conn.getresponse()
    util.decode_response(resp)
    if resp.getcode() == 201:
        print("Main database created.")
    elif resp.getcode() == 409 or resp.getcode() == 412:
        print("Main database already exists.")
    else:
        print("Error occurred.")
        sys.exit(-1)
    # Save the auth document
    auth_url = main_db_url + "/_design/_auth"
    conn.request("GET", auth_url, headers=gh)
    resp = conn.getresponse()
    addoc = util.decode_response(resp)
    req_body = ""
    if resp.getcode() == 404:
        req_body = json.dumps(auth_doc)
    elif resp.getcode() == 200:
        addoc["validate_doc_update"] = validation_func
        req_body = json.dumps(addoc)
    if len(req_body) > 1:
        conn.request("PUT", auth_url, body=req_body, headers=ph)
        resp = conn.getresponse()
        util.decode_response(resp)
        if resp.getcode() == 201:
            print("Main auth doc successfully updated.")
        else:
            print("Main auth doc update failed.")
    # Now load the main db security document
    sec_url = main_db_url + "/_security"
    main_sec = util.load_json_file(os.path.join(wf_dir, "scripts/ddoc/main_access.json"))
    req_body = json.dumps(main_sec)
    conn.request("PUT", sec_url, body=req_body, headers=ph)
    resp = conn.getresponse()
    util.decode_response(resp)
    if resp.getcode() == 200 or resp.getcode() == 201:
        print("Main security doc saved successfully.")
    else:
        print("Main security doc save failed.")
    # Now save the main db design doc
    main_ddoc_url = main_db_url + "/_design/wiki_query"
    conn.request("GET", main_ddoc_url, headers=gh)
    resp = conn.getresponse()
    existing_ddoc = util.decode_response(resp)
    main_ddoc = util.load_json_file(os.path.join(wf_dir, "scripts/ddoc/main_ddoc.json"))
    if resp.getcode() == 200:
        # Set the rev so we can update
        print("Main design doc exists.  Updating.")
        main_ddoc["_rev"] = existing_ddoc["_rev"]
    req_body = json.dumps(main_ddoc)
    conn.request("PUT", main_ddoc_url, body=req_body, headers=ph)
    resp = conn.getresponse()
    util.decode_response(resp)
    if resp.getcode() == 200 or resp.getcode() == 201:
        print("Main design doc saved successfully")
    else:
        print("Main design doc save failed")
Example #22
def setup_main_db(conn, main_db):

    write_role = main_db + ":write"
    # This is the validation function that controls writes to the main database
    validation_func = """
    function(newDoc, oldDoc, userCtx){
        if((userCtx.roles.indexOf("%s") === -1) &&
            (userCtx.roles.indexOf("admin") === -1) &&
            (userCtx.roles.indexOf("master") === -1) &&
            (userCtx.roles.indexOf("_admin") === -1)){
                throw({forbidden: "Not authorized"}); }
    }
    """ % write_role

    auth_doc = dict()
    auth_doc["_id"] = "_design/_auth"
    auth_doc["validate_doc_update"] = validation_func

    # Create the main database
    print("Creating the main Wikifeat database")
    main_db_url = '/' + main_db
    conn.request("PUT", main_db_url, headers=ph)
    resp = conn.getresponse()
    util.decode_response(resp)
    if resp.getcode() == 201:
        print("Main database created.")
    elif resp.getcode() == 409 or resp.getcode() == 412:
        print("Main database already exists.")
    else:
        print("Error occurred.")
        sys.exit(-1)
    # Save the auth document
    auth_url = main_db_url + '/_design/_auth'
    conn.request("GET", auth_url, headers=gh)
    resp = conn.getresponse()
    addoc = util.decode_response(resp)
    req_body = ""
    if resp.getcode() == 404:
        req_body = json.dumps(auth_doc)
    elif resp.getcode() == 200:
        addoc['validate_doc_update'] = validation_func
        req_body = json.dumps(addoc)
    if len(req_body) > 1:
        conn.request("PUT", auth_url, body=req_body, headers=ph)
        resp = conn.getresponse()
        util.decode_response(resp)
        if resp.getcode() == 201:
            print("Main auth doc successfully updated.")
        else:
            print("Main auth doc update failed.")
    # Now load the main db security document
    sec_url = main_db_url + '/_security'
    main_sec = util.load_json_file(
        os.path.join(wf_dir, "scripts/ddoc/main_access.json"))
    req_body = json.dumps(main_sec)
    conn.request("PUT", sec_url, body=req_body, headers=ph)
    resp = conn.getresponse()
    util.decode_response(resp)
    if resp.getcode() == 200 or resp.getcode() == 201:
        print("Main security doc saved successfully.")
    else:
        print("Main security doc save failed.")
    # Now save the main db design doc
    main_ddoc_url = main_db_url + '/_design/wiki_query'
    conn.request("GET", main_ddoc_url, headers=gh)
    resp = conn.getresponse()
    existing_ddoc = util.decode_response(resp)
    main_ddoc = util.load_json_file(
        os.path.join(wf_dir, "scripts/ddoc/main_ddoc.json"))
    if resp.getcode() == 200:
        # Set the rev so we can update
        print("Main design doc exists.  Updating.")
        main_ddoc['_rev'] = existing_ddoc['_rev']
    req_body = json.dumps(main_ddoc)
    conn.request("PUT", main_ddoc_url, body=req_body, headers=ph)
    resp = conn.getresponse()
    util.decode_response(resp)
    if resp.getcode() == 200 or resp.getcode() == 201:
        print("Main design doc saved successfully")
    else:
        print("Main design doc save failed")
Example #23
import pandas
from util import load_json_file, get_data_with_code, load_csv, get_value_from_json
from nlp import preprocess, get_jaccard_sim, get_intersection
import spacy
import numpy as np
nlp = spacy.load('en')

df = load_csv("data/output.csv", "|")
# print(df)

industry_json = load_json_file("sasb_mm_industry.json")
threat_json = load_json_file("sasb_mm_threats.json")

sub_df, sec_data_list = get_data_with_code("sasb", df,
                                           "Internet Media & Services")
# print(sec_data_list[0])
text_data = []

for data in sec_data_list:
    tokens = preprocess(data)
    # print(tokens)
    text_data.append(tokens)

threat_desc = []
threat_name = []

for threat in threat_json:
    # print(threat["Threat"])
    for obj in threat["SubThreats"]:
        # doc2 = nlp(obj["Description"])
        # print(obj["SubThreat"])
Example #24
def main(args):
    try:
        logging.info('(function {}) Started'.format(main.__name__))

        source_files = UTIL.parse_source_files(args.data_path, args.from_files, logging)
        source_file = source_files['source']
        destination_file = os.path.join(args.data_path, args.from_format.lower() + '_to_' + args.to_format.lower() + '_'+args.to_file_name)

        # TODO: 1) We need to create a interface class to have the same signature for all the formatters in ds_formatter folder.
        # TODO: 2) We need to create a generic approach to convert any type to any type not only any type to squad.
        # TODO: 3) can we have better approach to handle the following if/else scenarios
        # TODO: 4) we may also put some kind of field wrapper to handle whether which fields are gonna be filled with dummy and which fields are gonna be filled with real values.
        if args.from_format.lower() == 'qangaroo' and args.to_format.lower() == 'squad' :
            """            
            --log_path="~/log.log" 
            --data_path="~/data/qangaroo_v1.1/wikihop" 
            --from_files="source:dev.json"
            --from_format="qangaroo" 
            --to_format="squad" 
            --to_file_name="dev.json" #it is gonna be renamed as "[from_to]_filename.what"
            """
            in_content = UTIL.load_json_file(source_file, logging)
            formatted_content = qangaroo.convert_to_squad(in_content)
            UTIL.dump_json_file(destination_file, formatted_content, logging)

        elif args.from_format.lower() == 'mctest' and args.to_format.lower() == 'squad':
            """            
            --log_path="~/log.log" 
            --data_path="~/data/" 
            --from_files="source:mc160.dev.tsv" 
            --from_format="mctest" 
            --to_format="squad" 
            --to_file_name="mc160.dev.json" #it is gonna be renamed as "[from_to]_filename.what"
            """


            story_question_content = UTIL.load_csv_file(source_file,"\t", None, logging)
            #answer_content = UTIL.load_csv_file(additional_files['answer'], "\t", None, logging)
            formatted_content = mctest.convert_to_squad(story_question_content)
            UTIL.dump_json_file(destination_file, formatted_content, logging)

        elif args.from_format.lower() == 'insuranceqa' and args.to_format.lower() == 'squad':
            """            
            --log_path="~/log.log" 
            --data_path="~/data/insuranceqa_v2" 
            --from_files="source:InsuranceQA.question.anslabel.token.1500.pool.solr.test.encoded,voc:vocabulary.txt,answer:InsuranceQA.label2answer.token.encoded"
            --from_format="insuranceqa" 
            --to_format="squad" 
            --to_file_name="1500.test.json"
            """

            voc = insuranceqa.load_vocab(source_files['voc'])
            questions, a_to_q_map = insuranceqa.load_questions(source_file, voc)
            answers = insuranceqa.load_answers(source_files['answer'], voc)
            formatted_content = insuranceqa.convert_to_squad(questions, answers, a_to_q_map)
            UTIL.dump_json_file(destination_file, formatted_content, logging)

        elif args.from_format.lower() == 'triviaqa' and args.to_format.lower() == 'squad':
            """            
            --log_path="~/log.log" 
            --data_path="~/data/triviaqa/" 
            --from_files=""source:qa/wikipedia-train.json, wikipedia:evidence/wikipedia,web:evidence/web,seed:10,token_size:2000,sample_size:1000000"
            --from_format="triviaqa" 
            --to_format="squad" 
            --to_file_name="wikipedia-train-long.json"
            """

            wiki = source_files['wikipedia']
            web = source_files['web']
            seed = source_files['seed']
            max_num_of_tokens = source_files['token_size']
            sample_size = source_files['sample_size']
            qa_file = UTIL.load_json_file(source_file, logging)
            formatted_content = triviaqa.convert_to_squad_format(qa_file, wiki, web, sample_size, seed, max_num_of_tokens)
            UTIL.dump_json_file(destination_file, formatted_content, logging)
        elif args.from_format.lower() == 'wikiqa' and args.to_format.lower() == 'squad':
            """            
            --log_path="~/log.log" 
            --data_path="~/data/WikiQACorpus" 
            --from_files="source:WikiQA-dev.tsv"
            --from_format="wikiqa" 
            --to_format="squad" 
            --to_file_name="dev.json"
            """

            story_question_content = UTIL.load_csv_file(source_file, "\t", 'infer', logging)
            formatted_content = wikiqa.convert_to_squad(story_question_content)
            UTIL.dump_json_file(destination_file, formatted_content, logging)

        elif args.from_format.lower() == 'squad' and args.to_format.lower() == 'matchzoo':
            """       
            **sample.txt**: Each line is the raw query and raw document text of a document. The format is "label \t query \t document_txt".     
            --log_path="~/log.log" 
            --data_path="~/data/squad" 
            --from_files="source:dev-v1.1.json,q_len:1000,negative_sampling:100"
            --from_format="squad" 
            --to_format="matchzoo" 
            --to_file_name="dev.txt"
            """
            negative_samp_count = int(source_files['negative_sampling'])
            q_len = int(source_files['q_len'])
            content = UTIL.load_json_file(source_file, logging)
            generator = squad.yield_to_matchzoo(content, q_len, negative_samp_count)
            open(destination_file, "w").write('\n'.join(data for data in generator))

            #UTIL.dump_json_file(destination_file, formatted_content, logging)
        elif args.from_format.lower() == 'squad' and args.to_format.lower() == 'lucene':
            """       
            **sample.txt**: Each line is the raw query and raw document text of a document. The format is "label \t query \t document_txt".     
            --log_path="~/log.log" 
            --data_path="~/data/squad" 
            --from_files="source:dev-v1.1.json,doc_type_verbose:2"
            --from_format="squad" 
            --to_format="matchzoo" 
            --to_file_name="dev.txt"
            """
            doc_type_verbose = int(source_files['doc_type_verbose'])
            content = UTIL.load_json_file(source_file, logging)
            squad.convert_to_lucene(content, doc_type_verbose, args.data_path)
        elif args.from_format.lower() == 'squad' and args.to_format.lower() == 'short_squad':
            """       
            **sample.txt**: Each line is the raw query and raw document text of a document. The format is "label \t query \t document_txt".     
            --log_path="~/log.log" 
            --data_path="~/data/squad" 
            --from_files="source:dev-v1.1.json,q_len:1000,negative_sampling:100"
            --from_format="squad" 
            --to_format="short_squad" 
            --to_file_name="dev.json"
            """
            negative_samp_count = int(source_files['negative_sampling'])
            q_len = int(source_files['q_len'])
            content = UTIL.load_json_file(source_file, logging)
            formatted_content = squad.convert_to_short_squad(content, q_len, negative_samp_count)
            UTIL.dump_json_file(destination_file, formatted_content, logging)
        elif args.from_format.lower() == 'squad' and args.to_format.lower() == 'squad':
            """       
               In order to make some analyzes.      
              --log_path="~/log.log" 
              --data_path="~/data/squad" 
              --from_files="source:dev-v1.1.json,is_histogram:True,document_type:1" #1 for question, #2 for paragraphs, #3 for both.
              --from_format="squad" 
              --to_format="squad" 
              --to_file_name="dev.json"
            """
            is_histogram = source_files['is_histogram']
            document_type = int(source_files['document_type'])
            his_bin = int(source_files['histogram_bin'])
            content = UTIL.load_json_file(source_file, logging)
            squad.print_statistics(content, is_histogram, his_bin, document_type)

        elif args.from_format.lower() == 'narrativeqa' and args.to_format.lower() == 'squad':
            """            
            --log_path="~/log.log" 
            --data_path="~/data/narrativeqa" 
            --from_files="source:summaries.csv,set:train,qaps:qaps.csv" 
            --from_format="narrativeqa" 
            --to_format="squad" 
            --to_file_name="train.json" #it is gonna be renamed as "[from_to]_filename.what"
            """

            story_summary_content = UTIL.load_csv_file(source_file, ",", 'infer', logging)
            question_content = UTIL.load_csv_file(source_files['qaps'], ",", 'infer', logging)
            set_type = source_files['set']
            formatted_content = narrativeqa.convert_to_squad(story_summary_content, question_content, set_type)
            UTIL.dump_json_file(destination_file, formatted_content, logging)

        elif args.from_format.lower() == 'webqa' and args.to_format.lower() == 'squad':
            " ************************************************************ "
            " *********************** ON-HOLD *****************************"
            " ************************************************************ "
            """            
            --log_path="~/log.log" 
            --data_path="~/data/" 
            --from_files="label:question.train.token_idx.label,voc:vocabulary,answer:answers.label.token_idx" 
            --from_format="webqa" 
            --to_format="squad"
            --to_file_name="filename.what" #it is gonna be renamed as "[from_to]_filename.what" 
            """

            story_summary_content = UTIL.load_csv_file(source_file, ",", 'infer', logging)
            question_content = UTIL.load_csv_file(source_files['qaps'], ",", 'infer', logging)
            set_type = source_files['set']
            formatted_content = narrativeqa.convert_to_squad(story_summary_content, question_content, set_type)
            UTIL.dump_json_file(args.destination_file_path, formatted_content, logging)
        elif args.from_format.lower() == 'msmarco' and args.to_format.lower() == 'squad':
            """            
            --log_path="~/log.log" 
            --data_path="~/data/msmarco"
            --from_format="msmarco" 
            --to_format="squad"
            --to_file_name="dev_2.1.json" #it is gonna be renamed as "[from_to]_filename.what" 
            """
            input_dict = {}
            try:
                version = float(source_files['v'])
            except:
                version = 2.0

            input_dict['v'] = version
            if version <= 2.0:
                """
                for version <= 2.0
                --from_files="source:dev_2.1.json, v:2.0"
                """
                in_content = UTIL.load_json_file(source_file, logging)
                input_dict['story_question_content'] = in_content
                formatted_content = msmarco.convert_to_squad(in_content)
            else:
                """
                for version > 2.0
                --from_files="source:queries.train.csv,document:collection.tsv,mapping:qrels.train.csv,v:2.1,limit:-1"
                """
                queries = UTIL.load_csv_file(source_file, "\t", None, logging, ['id', 'content'])
                input_dict['queries'] = queries
                mappings = UTIL.load_csv_file(source_files['mapping'], "\t", None, logging, ['q_id', 'tmp1', 'p_id', 'tmp2'], [0,1,2,3])
                input_dict['mappings'] = mappings
                documents = UTIL.load_csv_file(source_files['document'], "\t", None, logging, ['id', 'content'])
                input_dict['documents'] = documents
                input_dict['limit'] = int(source_files['limit'])
                formatted_content = msmarco.convert_to_squad(input_dict)
            UTIL.dump_json_file(destination_file, formatted_content, logging)
        elif args.from_format.lower() == 'quasar' and args.to_format.lower() == 'squad':
            """            
            --log_path="~/log.log" 
            --data_path="~/data/quasar-t"
            --from_format="quasar-t" 
            --to_format="squad"
            --from_files="source:train_questions.json,document:train_contexts.json,type:t,is_null_tags_filter, limit:-1"
            --to_file_name="train.json"
            """
            if source_files['type'].lower() =='t':
                # quasar-t
                queries = UTIL.load_json_line_file(source_file, logging)
                documents = UTIL.load_json_line_file(source_files['document'], logging)
                formatted_content = quasar.convert_to_squad(queries, documents, source_files['is_null_tags_filter'], int(source_files['limit']))
            UTIL.dump_json_file(destination_file, formatted_content, logging)

        elif args.from_format.lower() == 'ubuntu' and args.to_format.lower() == 'squad':
            """            
            --log_path="~/log.log" 
            --data_path="~/data/ubuntu" 
            --from_files="source:valid.csv"
            --from_format="ubuntu" 
            --to_format="squad"
            --to_file_name="valid.json"
            """
            story_question_content = UTIL.load_csv_file(source_file, ",", 'infer', logging)
            formatted_content = ubuntudialogue.convert_to_squad(story_question_content)
            UTIL.dump_json_file(destination_file, formatted_content, logging)
        elif args.from_format.lower() == 'newsqa' and args.to_format.lower() == 'squad':

            """            
            --log_path="~/log.log" 
            --data_path="~/data/newsqa" 
            --from_files="source:newsqa-data-v1.csv,story:cnn_stories/"
            --from_format="newsqa" 
            --to_format="squad"
            --to_file_name="news.json"
            """

            story_question_content = UTIL.load_csv_file(source_file, ",", 'infer', logging)
            context_content_path = source_files['story']
            formatted_content = cnnnews.convert_to_squad(story_question_content, context_content_path)
            UTIL.dump_json_file(destination_file, formatted_content, logging)
        else:
            pass
        logging.info('(function {}) Finished'.format(main.__name__))
    except Exception as e:
        logging.error('(function {}) has an error: {}'.format(main.__name__, e))
        raise
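main above reads its configuration from an args namespace with log_path, data_path, from_files, from_format, to_format, and to_file_name attributes. A minimal argparse wrapper consistent with those names (a sketch, not the project's actual entry point):

import argparse
import logging
import os


def parse_args():
    # Attribute names mirror the ones accessed inside main(); defaults are guesses.
    parser = argparse.ArgumentParser(description='Convert QA datasets between formats.')
    parser.add_argument('--log_path', required=True)
    parser.add_argument('--data_path', required=True)
    parser.add_argument('--from_files', default='')
    parser.add_argument('--from_format', required=True)
    parser.add_argument('--to_format', required=True)
    parser.add_argument('--to_file_name', default='')
    return parser.parse_args()


if __name__ == '__main__':
    cli_args = parse_args()
    logging.basicConfig(filename=os.path.expanduser(cli_args.log_path), level=logging.INFO)
    main(cli_args)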
Example #25

if __name__ == "__main__":
    if len(sys.argv) < 2:
        print("give me directory with .json files with messages from channel")
        exit(-1)

    dir_name = sys.argv[1]
    messages = glob.glob(os.path.join(dir_name, "messages*.json"))
    regexp_hash = re.compile("hash")
    messages = [x for x in messages if not regexp_hash.search(x)]
    # messages = glob.glob(os.path.join(dir_name, "messages25.json"))

    for messages_json_filename in messages:
        print(messages_json_filename)
        goods = util.load_json_file(messages_json_filename)
        goods_count = len(goods)
        i = 0
        for g in goods:
            if len(g['seller']) <= 17:
                continue
            photo_link_jpg = g['photo_link']
            photo_hash = get_photo_hash(photo_link_jpg)
            g['hash'] = photo_hash
            print(i, "/", goods_count, " ", photo_hash)
            i += 1

        json_filename = os.path.splitext(
            messages_json_filename)[0] + "_hash.json"
        util.save_json_file(json_filename, goods)
        os.remove(messages_json_filename)
Example #26
def process_singles():
    singles = os.listdir(SINGLE_DIR)
    for fname in singles:
        fname = SINGLE_DIR + fname
        data = util.load_json_file(fname)

        if 'content' not in data:
            continue
        if 'caption' not in data['content']:
            continue

        content = data['content']
        caption = content['caption']
        text = caption['text']

        prod_caption_ent = None
        prod_price_ent = None
        prod_seller_ent = None
        prod_descr_ent = None
        hashtag_ents = []
        entities = caption['entities']
        for e in entities:
            entity_type = e['type']['@type']
            if entity_type == 'textEntityTypeHashtag':
                hashtag_ents.append(e)
            if entity_type == 'textEntityTypeBold':
                if not prod_caption_ent:
                    prod_caption_ent = e
                else:
                    prod_price_ent = e
            if entity_type == 'textEntityTypeItalic':
                prod_descr_ent = e
            if entity_type == 'textEntityTypeMentionName':
                prod_seller_ent = e

        if prod_caption_ent is None or prod_price_ent is None or prod_seller_ent is None or prod_descr_ent is None:
            continue

        product_hashtags = []
        for h in hashtag_ents:
            product_hashtags.append(get_from_text(text, h))
        product_caption = get_from_text(text, prod_caption_ent)
        product_descr = get_from_text(text, prod_descr_ent)
        product_price = get_from_text(text, prod_price_ent)
        product_seller_name = get_from_text(text, prod_seller_ent)
        product_city = get_city_from_text(text, prod_price_ent,
                                          prod_seller_ent)

        product_seller_id = prod_seller_ent['type']['user_id']

        photo_file_id = content['photo']['sizes'][-1]['photo']['remote']['id']

        r = tg.call_method('getUser', params={'user_id': product_seller_id})
        r.wait()
        seller = r.update

        product = {
            'hashtags': product_hashtags,
            'caption': product_caption,
            'descr': product_descr,
            'price': product_price,
            'city': product_city,
            'seller': {
                'id': product_seller_id,
                'full_name': product_seller_name,
                'username': seller['username'],
                'first_name': seller['first_name'],
                'last_name': seller['last_name'],
                'profile_photo': seller.get('profile_photo', None),
            },
            'photo': photo_file_id,
            'date': data['date']
        }

        url_id = data['url_id']
        pr_dir = os.path.join(PROCESSED_DIR, url_id)
        create_dir(pr_dir)

        util.save_json_file(os.path.join(pr_dir, 'data.json'), product)

        print(product)
Example #27
    connection.commit()


if __name__ == "__main__":
    if len(sys.argv) < 3:
        print(
            "give me directory with .json files (with hash) with messages from channel and channel name"
        )
        exit(-1)

    dir_name = sys.argv[1]
    tg_channel = sys.argv[2]

    sellers_filename = os.path.join(dir_name, "sellers.json")
    sellers = util.load_json_file(sellers_filename)
    sellers_to_mysql(sellers)

    exit(0)

    messages = glob.glob(os.path.join(dir_name, "messages*hash.json"))

    for messages_json_filename in messages:
        print(messages_json_filename)
        goods = util.load_json_file(messages_json_filename)
        for g in goods:
            if len(g['seller']) <= 17:
                continue
            seller_id = int(g['seller'][17:])

            description = g['description']
Example #28
     print "run       :  train, validate, test and report"
     print "track     :  track progress of jobs (remote only)"
     print "review    :  review performance of jobs (remote only)"
     print "dump      :  display cache item(s)"
     print "export    :  copy cache item(s) to clipboard"
     print "clear     :  clear cache item(s)"
     print "quit      :  quit"
     print "?         :  display help"
 elif (action == "portfolio"):
     portfolio = util.get_str_input("portfolio (%s) : " % portfolio, portfolio)
 elif (action == "study"):
     study = util.get_str_input("study (%s) : " % study, study)
 elif (action == "batches"):
     batches = util.get_str_input("batches (%s) : " % batches, batches)
 elif (action == "create"):
     portfolioParams = util.load_json_file("portfolio/%s.json" % portfolio)
     aPortfolio = ptf.Portfolio(portfolioParams)
     print "caching %s" % portfolio
     cache.put('portfolio/%s' % portfolio, aPortfolio, remote)
 elif (action == "remote"):
     remote = not remote
 elif (action == "debug"):
     debug = not debug
 elif (action == "pvdebug"):
     pvdebug = not pvdebug
     print pvdebug
     logging.getLogger().setLevel(level = logging.DEBUG if pvdebug else logging.INFO)
 elif (action == "prepare"):
     batcher.prepare(study, portfolio, remote)
 elif (action == "train"):
     batch_ = batcher.interpret_batches(study, portfolio, batches, remote)