コード例 #1
0
ファイル: loader.py プロジェクト: diffeo/Datawake
def upload_file(*args, **kwargs):
    """Create a new domain from an uploaded file of ``type,value`` lines.

    Reads the keyword arguments ``file_upload`` (an object whose ``.file``
    attribute supports ``readlines()``), ``name`` and ``description``.
    Positional arguments are ignored.

    Every line is stripped, has NUL characters removed and is split on its
    first comma; one pair of surrounding double quotes is dropped from each
    half.  Blank lines are ignored.  All lines are parsed before the domain
    is registered, so a malformed file does not leave behind a registered
    domain that has no items.

    Returns:
        A JSON string ``{"success": ...}``.  The value is ``false`` when the
        domain already exists or no file was supplied; otherwise it is
        whatever ``add_new_domain_items`` returned.

    Raises:
        ValueError: if a non-blank line contains no comma.
    """
    domain_content_connector = factory.get_entity_data_connector()
    try:
        domain_file = kwargs.get("file_upload")
        domain_name = kwargs.get("name")
        domain_description = kwargs.get("description")
        if db.domain_exists(domain_name) or domain_file is None:
            return json.dumps(dict(success=False))

        tangelo.log("read domain file")
        rowkeys = []
        for line_number, raw_line in enumerate(domain_file.file.readlines(), 1):
            # NUL is the rowkey field separator below, so it must not occur
            # inside a field.
            line = raw_line.strip().replace('\0', '')
            if not line:
                # Blank lines (typically a trailing newline) carry no entity.
                continue
            # Split on the first comma only; the value may contain commas.
            entity_type, comma, entity_value = line.partition(',')
            if not comma:
                raise ValueError(
                    "domain file line %d has no comma: %r" % (line_number, line))
            # startswith/endswith are safe on an empty field, unlike [0].
            if entity_type.startswith('"') and entity_type.endswith('"'):
                entity_type = entity_type[1:-1]
            if entity_value.startswith('"') and entity_value.endswith('"'):
                entity_value = entity_value[1:-1]
            rowkeys.append(domain_name + '\0' + entity_type + '\0' + entity_value)

        # Register the domain only once the whole file has parsed cleanly.
        db.add_new_domain(domain_name, domain_description)
        result = domain_content_connector.add_new_domain_items(rowkeys)
        return json.dumps(dict(success=result))
    finally:
        domain_content_connector.close()
コード例 #2
0
def delete_domain(domain_name):
    """Remove a domain and all of its stored items.

    Args:
        domain_name: name of the domain to delete.

    Returns:
        A JSON string ``{"success": true}`` when the domain existed and was
        removed, ``{"success": false}`` when no such domain exists.
    """
    if not db.domain_exists(domain_name):
        return json.dumps(dict(success=False))

    domain_content_connector = factory.get_entity_data_connector()
    try:
        db.remove_domain(domain_name)
        domain_content_connector.delete_domain_items(domain_name)
    finally:
        # Release the connector even if one of the deletions fails.
        domain_content_connector.close()
    return json.dumps(dict(success=True))
コード例 #3
0
def upload_database(domain_name, domain_description):
    """Register a new domain and start loading it on a background thread.

    Args:
        domain_name: name of the domain to create.
        domain_description: description stored with the new domain.

    Returns:
        A JSON string ``{"success": false}`` if the domain already exists.
        Otherwise the domain is added, a daemon thread running
        ``upload_database_threaded`` is started with both arguments passed
        as keyword arguments, and ``{"success": true}`` is returned without
        waiting for that thread.
    """
    if db.domain_exists(domain_name):
        return json.dumps({'success': False})

    db.add_new_domain(domain_name, domain_description)

    worker = threading.Thread(
        target=upload_database_threaded,
        kwargs={
            'domain_name': domain_name,
            'domain_description': domain_description,
        })
    worker.daemon = True
    worker.start()
    return json.dumps({'success': True})
コード例 #4
0
ファイル: loader.py プロジェクト: diffeo/Datawake
def delete_domain(*args, **kwargs):
    """Remove the domain named by the ``domain_name`` keyword argument.

    Every keyword-argument name received is written to the tangelo log.
    Positional arguments are ignored.

    Returns:
        A JSON string ``{"success": true}`` when the domain existed and was
        removed, ``{"success": false}`` when no such domain exists.
    """
    domain_name = kwargs.get("domain_name")
    for key in kwargs:
        tangelo.log(key)

    if not db.domain_exists(domain_name):
        return json.dumps(dict(success=False))

    domain_content_connector = factory.get_entity_data_connector()
    try:
        db.remove_domain(domain_name)
        domain_content_connector.delete_domain_items(domain_name)
    finally:
        # Release the connector even if one of the deletions fails.
        domain_content_connector.close()
    return json.dumps(dict(success=True))
コード例 #5
0
ファイル: loader.py プロジェクト: diffeo/Datawake
def upload_database(*args, **kwargs):
    """Register a new domain and start loading it on a background thread.

    Reads the keyword arguments ``domain_name`` and ``domain_description``.
    Positional arguments are ignored.

    Returns:
        ``{'success': False}`` if the domain already exists.  Otherwise the
        domain is added, a daemon thread running ``upload_database_threaded``
        is started with all received keyword arguments, and
        ``{'success': True}`` is returned without waiting for that thread.
    """
    name = kwargs.get("domain_name")
    description = kwargs.get("domain_description")

    if db.domain_exists(name):
        return dict(success=False)

    db.add_new_domain(name, description)

    # The complete kwargs mapping is forwarded, not just the two keys above.
    worker = threading.Thread(target=upload_database_threaded, kwargs=kwargs)
    worker.daemon = True
    worker.start()
    return dict(success=True)
コード例 #6
0
ファイル: domaingen.py プロジェクト: brandontheis/Datawake
def get(domain,trail,stars,newdomain):
    """Populate ``newdomain`` with the entities found on a trail's URLs.

    The URL set is every URL returned by ``db.getRankedUrls`` plus, when
    ``stars`` is below 1, every URL from ``db.getBrowsePathUrls``.  For those
    URLs the extracted entities (minus values marked for ``domain``) and the
    feedback entities are collected as ``type\\0value`` strings, with commas
    in either part replaced by spaces, and stored under ``newdomain``.
    ``newdomain`` is registered first if it does not already exist.

    Args:
        domain: domain whose marked and feedback entities are consulted.
        trail: trail whose URLs are used.
        stars: value convertible with ``int()``.
        newdomain: name of the domain that receives the items.

    Returns:
        None.

    Raises:
        ValueError: if ``stars`` cannot be converted to an integer.
    """
    # Validate before touching the database so a bad value creates nothing.
    stars = int(stars)
    org = helper.get_org().upper()

    if not db.domain_exists(newdomain):
        db.add_new_domain(newdomain,'auto generated domain from trail: '+trail)

    # Every ranked URL is included, whatever its rank.
    # NOTE(review): `stars` is only compared against 1 below and is never
    # checked against the rank -- confirm that is intended.
    url_set = set(url for (url, _rank) in db.getRankedUrls(org,trail,domain))

    if stars < 1:
        url_set.update(db.getBrowsePathUrls(org,trail))

    # Values marked for the domain are excluded; only the value is compared,
    # the marked entity's type is not.
    marked_entities = set(
        value for (_type, value) in db.get_marked_entities_for_domain(org, domain))

    features = set()
    entity_data_connector = factory.get_entity_data_connector()
    try:
        # for each url get all extracted entities
        all_entities = entity_data_connector.get_extracted_entities_from_urls(url_set)
        for _url, feature_dict in all_entities.iteritems():
            for entity_type, values in feature_dict.iteritems():
                entity_type = entity_type.replace(',',' ')
                for value in values:
                    # Membership is tested on the raw value, before commas
                    # are replaced.
                    if value not in marked_entities:
                        features.add(entity_type+"\0"+value.replace(',',' '))

        # for each url get any manually extracted entities
        for url in url_set:
            for feature in db.get_feedback_entities(org, domain, url):
                entity_type = feature['type'].replace(',',' ')
                value = feature['value'].replace(',',' ')
                features.add(entity_type+"\0"+value)

        entity_data_connector.add_new_domain_items(
            [newdomain+'\0'+feature for feature in features])
    finally:
        # Release the connector even if a lookup or the insert fails.
        entity_data_connector.close()
コード例 #7
0
ファイル: domaingen.py プロジェクト: nagyistge/Datawake
def get(domain, trail, stars, newdomain):
    """Populate ``newdomain`` with the entities found on a trail's URLs.

    The URL set is every URL returned by ``db.getRankedUrls`` plus, when
    ``stars`` is below 1, every URL from ``db.getBrowsePathUrls``.  For those
    URLs the extracted entities (minus values marked for ``domain``) and the
    feedback entities are collected as ``type\\0value`` strings, with commas
    in either part replaced by spaces, and stored under ``newdomain``.
    ``newdomain`` is registered first if it does not already exist.

    Args:
        domain: domain whose marked and feedback entities are consulted.
        trail: trail whose URLs are used.
        stars: value convertible with ``int()``.
        newdomain: name of the domain that receives the items.

    Returns:
        None.

    Raises:
        ValueError: if ``stars`` cannot be converted to an integer.
    """
    # Validate before touching the database so a bad value creates nothing.
    stars = int(stars)
    org = helper.get_org().upper()

    if not db.domain_exists(newdomain):
        db.add_new_domain(newdomain,
                          'auto generated domain from trail: ' + trail)

    # Every ranked URL is included, whatever its rank.
    # NOTE(review): `stars` is only compared against 1 below and is never
    # checked against the rank -- confirm that is intended.
    url_set = set(
        url for (url, _rank) in db.getRankedUrls(org, trail, domain))

    if stars < 1:
        url_set.update(db.getBrowsePathUrls(org, trail))

    # Values marked for the domain are excluded; only the value is compared,
    # the marked entity's type is not.
    marked_entities = set(
        value
        for (_type, value) in db.get_marked_entities_for_domain(org, domain))

    features = set()
    entity_data_connector = factory.get_entity_data_connector()
    try:
        # for each url get all extracted entities
        all_entities = entity_data_connector.get_extracted_entities_from_urls(
            url_set)
        for _url, feature_dict in all_entities.iteritems():
            for entity_type, values in feature_dict.iteritems():
                entity_type = entity_type.replace(',', ' ')
                for value in values:
                    # Membership is tested on the raw value, before commas
                    # are replaced.
                    if value not in marked_entities:
                        features.add(
                            entity_type + "\0" + value.replace(',', ' '))

        # for each url get any manually extracted entities
        for url in url_set:
            for feature in db.get_feedback_entities(org, domain, url):
                entity_type = feature['type'].replace(',', ' ')
                value = feature['value'].replace(',', ' ')
                features.add(entity_type + "\0" + value)

        entity_data_connector.add_new_domain_items(
            [newdomain + '\0' + feature for feature in features])
    finally:
        # Release the connector even if a lookup or the insert fails.
        entity_data_connector.close()
コード例 #8
0
def upload_file(file_upload, name, description):
    """Create a new domain from an uploaded file of ``type,value`` lines.

    Every line is stripped, has NUL characters removed and is split on its
    first comma; one pair of surrounding double quotes is dropped from each
    half.  Blank lines are ignored.  All lines are parsed before the domain
    is registered, so a malformed file does not leave behind a registered
    domain that has no items.

    Args:
        file_upload: object whose ``.file`` attribute supports
            ``readlines()``, or ``None`` when nothing was uploaded.
        name: name of the domain to create; it is concatenated into the log
            message, so it must be a string.
        description: description stored with the new domain.

    Returns:
        A JSON string ``{"success": ...}``.  The value is ``false`` when the
        domain already exists or no file was supplied; otherwise it is
        whatever ``add_new_domain_items`` returned.

    Raises:
        ValueError: if a non-blank line contains no comma.
    """
    tangelo.log("Loading new domain: "+name)
    domain_content_connector = factory.get_entity_data_connector()
    try:
        if db.domain_exists(name) or file_upload is None:
            return json.dumps(dict(success=False))

        rowkeys = []
        for line_number, raw_line in enumerate(file_upload.file.readlines(), 1):
            # NUL is the rowkey field separator below, so it must not occur
            # inside a field.
            line = raw_line.strip().replace('\0', '')
            if not line:
                # Blank lines (typically a trailing newline) carry no entity.
                continue
            # Split on the first comma only; the value may contain commas.
            entity_type, comma, entity_value = line.partition(',')
            if not comma:
                raise ValueError(
                    "domain file line %d has no comma: %r" % (line_number, line))
            # startswith/endswith are safe on an empty field, unlike [0].
            if entity_type.startswith('"') and entity_type.endswith('"'):
                entity_type = entity_type[1:-1]
            if entity_value.startswith('"') and entity_value.endswith('"'):
                entity_value = entity_value[1:-1]
            rowkeys.append("%s\0%s\0%s" % (name, entity_type, entity_value))

        # Register the domain only once the whole file has parsed cleanly.
        db.add_new_domain(name, description)
        result = domain_content_connector.add_new_domain_items(rowkeys)
        return json.dumps(dict(success=result))
    finally:
        domain_content_connector.close()