def setUp(self):
    """Create a throwaway Refine project and store an open handle to it.

    The project is opened through a fresh ``refine.Refine`` connection
    after creation, to force a refresh once the JSON operations have been
    applied.
    """
    new_project_id = create.create_refine_project(
        TEST_DATA, 'Test project (can be safely removed).')
    self.project = refine.Refine(create.SERVER).open_project(new_project_id)
Example #2
0
def create_refine_project(path, name, pretend=False, verbose=0):
    """
    Prepare an upload file from ``path`` and create a Refine project from it.

    The project title is derived from the base name of ``path`` and the
    current time; the ``name`` argument is accepted but not used by this
    function. When ``pretend`` is true nothing is sent to the server and
    ``None`` is returned, otherwise the id of the new project is returned.
    """
    source_path = os.path.abspath(path)
    msg('Create a file that we can upload to Refine.', verbose)
    upload_path = clean_ads_affs(source_path, verbose)
    msg('Upload to Refine.', verbose)

    base_name = os.path.basename(path).replace('.reversed', '.merged')
    project_name = 'Astronomy affiliations (%s) (created %s)' % (
        base_name, time.asctime())
    print('Creating project "%s".' % project_name)

    if pretend:
        return None

    # Import settings handed to new_project(): tab-separated input with a
    # single header line.
    import_options = dict(
        split_into_columns=True,
        separator='\t',
        ignore_initial_non_blank_lines=0,
        header_lines=1,
        skip_initial_data_rows=0,
        limit=0,
        guess_value_type=False,
        ignore_quotes=False,
    )
    project = refine.Refine(SERVER).new_project(
        project_file=upload_path,
        project_name=project_name,
        **import_options)

    msg('Done with success.', verbose)
    return project.project_id
    def setUp(self):
        """Create a test project, apply a fixed set of edits, export affiliations.

        The resulting project handle is kept in ``self.project`` and the
        output of ``export.format_affiliations`` in ``self.affs``.
        """
        project_id = create.create_refine_project(
            TEST_DATA, 'Test project (can be safely removed).')
        # The project is reopened to force the refresh after the JSON
        # operations have been applied.
        self.project = refine.Refine(create.SERVER).open_project(project_id)

        # Argument triples for self.project.edit(), applied in this order.
        edit_calls = [
            # Modify an affiliation.
            ('New affiliation',
             'Astronomical Institute "Anton Pannekoek", University of Amsterdam, Kruislaan 403, NL--1098 SJ Amsterdam, The Netherlands',
             'Astronomical Institute "Anton Pannekoek"'),
            # Remove an affiliation.
            ('New affiliation', 'San Cosme y Damian, Paraguay', ''),
            # Modify an email.
            ('New emails', """[u'*****@*****.**"']""", "[u'*****@*****.**']"),
            # Remove an email.
            ('New emails', "[u'*****@*****.**']", ''),
            ('New emails', "[u'*****@*****.**']", '[]'),
        ]
        for edit_args in edit_calls:
            self.project.edit(*edit_args)

        # Grab the affiliations.
        self.affs = export.format_affiliations(project_id)
Example #4
0
 def setUp(self):
     """Connect to the Refine server and, if a file is configured, import it.

     ``self.project`` is only assigned when ``self.project_file`` is truthy.
     """
     self.server = refine.RefineServer()
     self.refine = refine.Refine(self.server)
     if not self.project_file:
         return
     self.project = self.refine.new_project(
         project_file=self.project_file,
         project_format=self.project_format,
         **self.project_options)
Example #5
0
 def set_up(self):
     """Connect to the Refine server and import the configured project file.

     When ``self.project_file`` is truthy, the file located by
     ``self.project_path()`` is imported with ``'|'`` as separator and the
     new project is stored in ``self.project``.
     """
     self.server = refine.RefineServer()
     self.refine = refine.Refine(self.server)
     if self.project_file:
         source_path = self.project_path()
         self.project = self.refine.new_project(
             project_file=source_path,
             project_format=self.project_format,
             separator='|',
             **self.project_options)
Example #6
0
def info(project_id):
    """Show project metadata"""
    client = refine.Refine(refine.RefineServer())
    for candidate_id, metadata in client.list_projects().items():
        if project_id != candidate_id:
            continue
        print('{0}: {1}'.format('id', candidate_id))
        # Remaining fields are printed in a fixed order.
        for field in ('name', 'created', 'modified'):
            print('{0}: {1}'.format(field, metadata[field]))
Example #7
0
def list_projects():
    """Query the Refine server and list projects by ID: name.

    Projects are printed one per line, most recently modified first.
    """
    projects = refine.Refine(refine.RefineServer()).list_projects().items()

    def date_to_epoch(json_dt):
        """Convert a JSON date time into seconds-since-epoch."""
        return time.mktime(time.strptime(json_dt, '%Y-%m-%dT%H:%M:%SZ'))

    # sorted() works on both a list (Python 2 dict.items()) and a dict view
    # (Python 3), whereas calling .sort() on the result only works on a list.
    newest_first = sorted(
        projects, key=lambda v: date_to_epoch(v[1]['modified']), reverse=True)
    for project_id, project_info in newest_first:
        print('{0:>14}: {1}'.format(project_id, project_info['name']))
Example #8
0
 def connect(self):
     """Configure the refine module from settings and return a client.

     Raises CommandError when either GOOGLE_REFINE_HOST or
     GOOGLE_REFINE_PORT is missing from settings, or when the server
     request raises URLError.
     """
     try:
         refine.REFINE_HOST = settings.GOOGLE_REFINE_HOST
         refine.REFINE_PORT = settings.GOOGLE_REFINE_PORT
     except AttributeError:
         raise CommandError(
             'Set GOOGLE_REFINE_HOST and GOOGLE_REFINE_PORT in settings')

     try:
         refine_server = refine.RefineServer()
         client = refine.Refine(refine_server)
         # The version request is issued only to find out whether the
         # server answers; its result is discarded.
         refine_server.get_version()
     except URLError:
         raise CommandError('Google Refine server is not reachable.')
     else:
         return client
Example #9
0
def ls():
    """Query the server and list projects sorted by mtime."""

    def modified_epoch(entry):
        """Return the 'modified' stamp of an (id, info) pair as epoch seconds."""
        stamp = time.strptime(entry[1]['modified'], '%Y-%m-%dT%H:%M:%SZ')
        return time.mktime(stamp)

    projects = refine.Refine(refine.RefineServer()).list_projects().items()
    projects.sort(key=modified_epoch, reverse=True)
    if not projects:
        print('Error: No projects found')
        return
    for project_id, project_info in projects:
        line = u'{0:>14}: {1}'.format(project_id, project_info['name'])
        print(line.encode('utf-8'))
Example #10
0
    def __init__(self, source=None):
        """Bind this object to ``source`` and to its OpenRefine project.

        When ``source.ORid`` is falsy a project is created via
        ``self.createOR(source)``; otherwise the existing project with that
        id is wrapped in a ``refine.RefineProject``.

        NOTE(review): ``source.ORid`` is read unconditionally, so the
        default ``source=None`` raises AttributeError; callers have to pass
        a source object.
        """
        server_url = current_app.config.get("OPENREFINE_SERVER")
        self.refine_server = refine.Refine(server=server_url)

        # The source object carries the OpenRefine project id (ORid).
        # TODO: check that the source exists and is usable (bad URLs and
        # bad files should be rejected here).
        self.source = source

        if source.ORid:
            self.refineproj = refine.RefineProject(
                server=server_url,
                project_id=str(int(self.source.ORid)))
        else:
            self.refineproj = self.createOR(source)
def latest_ast_affiliations_project_id():
    """
    Returns the project id of the latest astronomy affiliations project.

    Every project name is searched for the file-name pattern below; the
    project whose matched file name compares greatest (string comparison)
    wins. Returns None when no project name matches.
    """
    server = refine.Refine(SERVER)
    # Raw string so that \d reaches the regex engine as written instead of
    # relying on Python passing an unknown string escape through.
    name_pattern = re.compile(r'affils.ast.\d{8}_\d{4}')

    latest_name, latest_id = '', None
    for project_id, properties in server.list_projects().items():
        match = name_pattern.search(properties['name'])
        if match is not None and match.group() > latest_name:
            latest_name, latest_id = match.group(), project_id

    print('Extracting from project "%s".' % latest_id)

    return latest_id
Example #12
0
def info(project_id):
    """Show project metadata"""
    projects = refine.Refine(refine.RefineServer()).list_projects()
    if project_id not in projects:
        print('Error: No project found with id %s.\n'
              'Check existing projects with command --list' % (project_id))
        return

    print('{0:>20}: {1}'.format('id', project_id))
    project_url = ('http://' + refine.REFINE_HOST + ':' + refine.REFINE_PORT +
                   '/project?project=' + project_id)
    print('{0:>20}: {1}'.format('url', project_url))

    # Only metadata entries with a truthy value are shown.
    for key, value in projects[project_id].items():
        if value:
            print(u'{0:>20}: {1}'.format(key, value))

    # Column names come from the project's column model, numbered from 1.
    models = refine.RefineProject(project_id).get_models()
    column_names = [c['name'] for c in models['columnModel']['columns']]
    for position, column_name in enumerate(column_names, start=1):
        label = u'column ' + str(position).zfill(3)
        print(u'{0:>20}: {1}'.format(label, column_name))
Example #13
0
def testORLoad(sourceurl=None, fileobj=None):
    """Copy source data into a temporary CSV and create an OpenRefine project.

    The data is fetched from ``sourceurl`` when given, otherwise read from
    the path ``fileobj``. It is written to a timestamp-named ``.csv`` file
    in the system temp directory, which is then uploaded as a new project
    to the server configured under ``OPENREFINE_SERVER``.

    Returns False when neither source is given or when project creation
    fails (the temporary file is removed in the latter case).
    """
    # TODO: add checks for a valid URL or file path
    if not sourceurl and not fileobj:
        # Without a source there is nothing to upload; stop here instead of
        # continuing and failing later on the undefined data.
        print("You're missing the sourceurl or the fileobj")
        return False

    # download source data and save to temp file
    if sourceurl:
        res = requests.get(sourceurl)
        datatext = res.text
    else:
        with codecs.open(fileobj, 'rb') as datafile:
            datatext = datafile.read()

    filepath = os.path.join(tempfile.gettempdir(),
                            str(int(time.time())) + ".csv").replace("\\", "/")

    with codecs.open(filepath, 'wb', 'utf-8') as f:
        f.write(datatext)

    OPENREFINE_SERVER = current_app.config.get("OPENREFINE_SERVER")

    #store raw file here with barn

    try:
        refine_server = refine.Refine(server=OPENREFINE_SERVER)
        refineproj = refine_server.new_project(
            project_file=filepath,
            project_name="testerhere",
            separator=',',
            #store_blank_rows=True,
            #store_blank_cells_as_nulls=True
        )
    except Exception as e:
        print("hit error on project creation")
        print(e)
        os.remove(filepath)
        return False
Example #14
0
def create(project_file,
           project_format=None,
           columnWidths=None,
           encoding=None,
           guessCellValueTypes=False,
           headerLines=None,
           ignoreLines=None,
           includeFileSources=False,
           limit=None,
           linesPerRow=None,
           processQuotes=True,
           projectName=None,
           projectTags=None,
           recordPath=None,
           separator=None,
           sheets=None,
           skipDataLines=None,
           storeBlankCellsAsNulls=True,
           storeBlankRows=True,
           storeEmptyStrings=True,
           trimStrings=False):
    """Create a new project from file.

    When ``project_format`` is not given it is guessed from the extension
    of ``project_file``; a ``.txt`` file is treated as fixed-width if
    ``columnWidths`` can be indexed and as line-based otherwise. The short
    format name is then mapped to the importer format string and
    format-specific defaults are filled in for options left unset.

    Prints the id and row count of the new project and returns the project.
    Raises Exception when the created project contains 0 rows.
    """
    # Guess the format from the file extension (without the dot, lowercased).
    if not project_format:
        project_format = os.path.splitext(project_file)[1][1:].lower()
        if project_format == 'txt':
            try:
                # Indexing None raises TypeError, i.e. no widths were given.
                # An empty list would raise IndexError, which is not caught.
                columnWidths[0]
                project_format = 'fixed-width'
            except TypeError:
                project_format = 'line-based'
    # Map the short format name to the importer format string and apply the
    # defaults for that file type.
    if project_format == 'xml':
        project_format = 'text/xml'
        if not recordPath:
            # Default record path: the tag of the document's root element.
            recordPath = [ElementTree.parse(project_file).getroot().tag]
    elif project_format == 'csv':
        project_format = 'text/line-based/*sv'
    elif project_format == 'tsv':
        project_format = 'text/line-based/*sv'
        if not separator:
            separator = '\t'
    elif project_format == 'line-based':
        project_format = 'text/line-based'
        if not skipDataLines:
            skipDataLines = -1
    elif project_format == 'fixed-width':
        project_format = 'text/line-based/fixed-width'
        if not headerLines:
            headerLines = 0
    elif project_format == 'json':
        project_format = 'text/json'
        if not recordPath:
            recordPath = ['_', '_']
    elif project_format == 'xls':
        project_format = 'binary/text/xml/xls/xlsx'
        if not sheets:
            sheets = [0]
            # TODO: new format for sheets option introduced in OpenRefine 2.8
    elif project_format == 'xlsx':
        project_format = 'binary/text/xml/xls/xlsx'
        if not sheets:
            sheets = [0]
            # TODO: new format for sheets option introduced in OpenRefine 2.8
    elif project_format == 'ods':
        project_format = 'text/xml/ods'
        if not sheets:
            sheets = [0]
            # TODO: new format for sheets option introduced in OpenRefine 2.8
    # Execute. vars() snapshots every local bound so far, i.e. all parameters
    # with the adjustments made above; entries that are None are dropped.
    # Keep this the first local assignment after the parameters: any extra
    # local defined earlier would be forwarded to new_project() as well.
    kwargs = {k: v for k, v in vars().items() if v is not None}
    # A subset of the options is additionally passed under snake_case
    # keyword names; the camelCase originals still travel inside kwargs.
    project = refine.Refine(refine.RefineServer()).new_project(
        guess_cell_value_types=guessCellValueTypes,
        ignore_lines=ignoreLines,
        header_lines=headerLines,
        skip_data_lines=skipDataLines,
        store_blank_rows=storeBlankRows,
        process_quotes=processQuotes,
        project_name=projectName,
        store_blank_cells_as_nulls=storeBlankCellsAsNulls,
        include_file_sources=includeFileSources,
        **kwargs)
    # A project with zero rows is reported as an error.
    rows = project.do_json('get-rows')['total']
    if rows > 0:
        print('{0}: {1}'.format('id', project.project_id))
        print('{0}: {1}'.format('rows', rows))
        return project
    else:
        raise Exception(
            'Project contains 0 rows. Please check --help for mandatory '
            'arguments for xml, json, xlsx and ods')
Example #15
0
# @in to:['Waldorf Astoria','Hamburg Amerika Linie','Norddeutscher Lloyd Bremen']
# @in Json_History_id
# @in function:getToValue
# @in function:getFromValue
# @out outputFile @uri file: PartTest.tsv
# @out projectID
# @out projectNoRows


# @begin CreateProject @desc create project from file
# @in csvFile @uri file: partTest.csv
# @in refinePythonFile @uri file: refine.py
# @out projectID
# @out projectNoRows
from google.refine import refine
# Create the project from partTest.csv and keep element [1] of the value
# returned by new_project().
# NOTE(review): presumably the refine.py referenced above returns a sequence
# whose second item is the project id - confirm against that file.
projectID=refine.Refine(refine.RefineServer()).new_project('partTest.csv','HalfMenuDataset','.csv')[1]
# print(refine.myParser('--list'))
# @end CreateProject


'''insert a function to automatically get 'from' '''
def getFromValue(computeCluster):
    """Return, for each cluster, the list of its entries' 'value' fields.

    ``computeCluster`` is a sequence of clusters, each a sequence of
    mappings with a 'value' key; the nesting of the result mirrors it.
    """
    return [[entry['value'] for entry in cluster]
            for cluster in computeCluster]
Example #16
0
def main():
    """Command line interface.

    Reads the server address from the docker link environment variables
    and from the --host/--port options, resolves a project name given as
    first argument to its id, then runs the selected commands.

    Raises Exception when the name matches more than one project and
    IndexError when it matches none. Returns the project for the apply,
    export and info commands, otherwise None.
    """

    # get environment variables in docker network
    docker_host = os.environ.get('OPENREFINE_SERVER_PORT_3333_TCP_ADDR')
    if docker_host:
        os.environ["OPENREFINE_HOST"] = docker_host
        refine.REFINE_HOST = docker_host
    docker_port = os.environ.get('OPENREFINE_SERVER_PORT_3333_TCP_PORT')
    if docker_port:
        os.environ["OPENREFINE_PORT"] = docker_port
        refine.REFINE_PORT = docker_port

    options, args = PARSER.parse_args()
    # Collect the command options (groups 2 and 3) that were actually set.
    commands_dict = {
        group2_arg.dest: getattr(options, group2_arg.dest)
        for group2_arg in group2.option_list
    }
    commands_dict.update({
        group3_arg.dest: getattr(options, group3_arg.dest)
        for group3_arg in group3.option_list
    })
    commands_dict = {k: v for k, v in commands_dict.items() if v is not None}
    if not commands_dict:
        PARSER.print_usage()
        return
    if options.host:
        refine.REFINE_HOST = options.host
    if options.port:
        refine.REFINE_PORT = options.port
    # A non-numeric first argument is taken as a project name and replaced
    # by the id of the project carrying that name.
    if args and not str.isdigit(args[0]):
        projects = refine.Refine(refine.RefineServer()).list_projects().items()
        idlist = [
            str(project_id) for project_id, project_info in projects
            if args[0] == project_info['name']
        ]
        if len(idlist) > 1:
            raise Exception(
                'Found at least two projects. Please specify project by id.')
        if not idlist:
            # Same exception type as the bare idlist[0] lookup raised
            # before, but with a message that says what went wrong.
            raise IndexError('No project found with name %s.' % args[0])
        args[0] = idlist[0]

    if options.list:
        list_projects()
    if options.create:
        create_project(options)
    if options.delete:
        project = refine.RefineProject(args[0])
        project.delete()
    if options.apply:
        project = refine.RefineProject(args[0])
        response = project.apply_operations(options.apply)
        if response != 'ok':
            sys.stderr.write('Failed to apply %s: %s\n'
                             % (options.apply, response))
        return project
    if options.export or options.output:
        project = refine.RefineProject(args[0])
        export_project(project, options)
        return project
    if options.info:
        info(args[0])
        project = refine.RefineProject(args[0])
        return project
Example #17
0
def create_project(options):
    """Create a new project from options.create file.

    The input format is guessed from the file extension (``.txt`` becomes
    fixed-width when ``options.columnWidths`` is set, line-based otherwise)
    unless ``options.input_format`` overrides it. The defaults of that
    format are merged with the group4 options the user set, where the
    strings 'true' and 'false' are converted to booleans.

    Raises KeyError when the resulting format has no defaults entry.
    """
    # general defaults are defined in google_refine/refine/refine.py new_project
    # additional defaults for each file type
    defaults = {
        'xml': {'project_format': 'text/xml', 'recordPath': 'record'},
        'csv': {'project_format': 'text/line-based/*sv', 'separator': ','},
        'tsv': {'project_format': 'text/line-based/*sv', 'separator': '\t'},
        'line-based': {
            'project_format': 'text/line-based',
            'skipDataLines': -1,
        },
        'fixed-width': {
            'project_format': 'text/line-based/fixed-width',
            'headerLines': 0,
        },
        'json': {'project_format': 'text/json', 'recordPath': ('_', '_')},
        'xls': {'project_format': 'binary/text/xml/xls/xlsx', 'sheets': 0},
        'xlsx': {'project_format': 'binary/text/xml/xls/xlsx', 'sheets': 0},
        'ods': {'project_format': 'text/xml/ods', 'sheets': 0},
    }
    # guess format from file extension (or legacy option --format)
    input_format = os.path.splitext(options.create)[1][1:].lower()
    if input_format == 'txt':
        input_format = 'fixed-width' if options.columnWidths else 'line-based'
    if options.input_format:
        input_format = options.input_format
    # defaults for selected format
    input_dict = defaults[input_format]
    # merge defaults with user input; unset options (None) are skipped and
    # the strings 'true' / 'false' become real booleans
    for group4_arg in group4.option_list:
        value = getattr(options, group4_arg.dest)
        if value is None:
            continue
        if value == 'true':
            value = True
        elif value == 'false':
            value = False
        input_dict[group4_arg.dest] = value
    input_dict['project_file'] = options.create
    refine.Refine(refine.RefineServer()).new_project(**input_dict)
Example #18
0
def main():
    """Command line interface."""

    def typed_options(option_group):
        """Return the set options of a group with 'true'/'false' as bools."""
        raw = {
            opt.dest: getattr(options, opt.dest)
            for opt in option_group.option_list
        }
        typed = {}
        for dest, value in raw.items():
            if value is None:
                continue
            if value == 'true':
                typed[dest] = True
            elif value == 'false':
                typed[dest] = False
            else:
                typed[dest] = value
        return typed

    options, args = PARSER.parse_args()

    # set environment
    if options.host:
        refine.REFINE_HOST = options.host
    if options.port:
        refine.REFINE_PORT = options.port

    # get project_id: a non-numeric first argument is looked up as a name
    if args and not str.isdigit(args[0]):
        projects = refine.Refine(refine.RefineServer()).list_projects().items()
        idlist = [
            str(candidate_id) for candidate_id, project_info in projects
            if args[0].decode('UTF-8') == project_info['name']
        ]
        if len(idlist) > 1:
            print('Error: Found %s projects with name %s.\n'
                  'Please specify project by id.' % (len(idlist), args[0]))
            for i in idlist:
                print('')
                cli.info(i)
            return
        if not idlist:
            print('Error: No project found with name %s.\n'
                  'Try command --list' % args[0])
            return
        project_id = idlist[0]
    elif args:
        project_id = args[0]

    # commands without args
    if options.list:
        cli.ls()
    elif options.download:
        cli.download(options.download, output_file=options.output)
    elif options.create:
        kwargs = typed_options(group5)
        if options.file_format:
            kwargs.update({'project_format': options.file_format})
        cli.create(options.create, **kwargs)
    # commands with args
    elif args and options.info:
        cli.info(project_id)
    elif args and options.delete:
        cli.delete(project_id)
    elif args and options.apply:
        cli.apply(project_id, options.apply)
    elif args and options.template:
        kwargs = typed_options(group6)
        cli.templating(project_id,
                       options.template,
                       output_file=options.output,
                       **kwargs)
    elif args and (options.export or options.output):
        cli.export(project_id,
                   output_file=options.output,
                   export_format=options.file_format)
    else:
        PARSER.print_usage()
def main():
    """Delete every project whose name starts with 'Test project'."""
    server = refine.Refine(SERVER)
    for project_id, metadata in server.list_projects().items():
        if not metadata['name'].startswith('Test project'):
            continue
        server.open_project(project_id).delete()
Example #20
0
def main():
    """Command line interface.

    Applies --host/--port to the refine module, resolves
    ``options.project_id`` (a numeric id, or a project name looked up on
    the server) and dispatches to the selected ``cli`` command. Prints the
    help text when no command was selected.
    """

    options = parser.parse_args()
    # set environment
    if options.host:
        refine.REFINE_HOST = options.host
    if options.port:
        refine.REFINE_PORT = options.port

    # get project_id
    if options.project_id and str.isdigit(options.project_id):
        project_id = options.project_id
    elif options.project_id:
        # Not numeric: treat the value as a project name.
        projects = refine.Refine(refine.RefineServer()).list_projects().items()
        idlist = []
        for project_id, project_info in projects:
            if options.project_id == project_info['name']:
                idlist.append(str(project_id))
        if len(idlist) > 1:
            print('Error: Found {idlist} projects with name {name}.\n'
                  'Please specify project by id.'.format(
                      idlist=len(idlist), name=options.project_id))
            for i in idlist:
                print('')
                cli.info(i)
            return
        else:
            try:
                project_id = idlist[0]
            except IndexError:
                print('Error: No project found with name {}.\n'
                      'Try command --list'.format(options.project_id))
                return

    # commands without args
    if options.list:
        cli.ls()
    elif options.download:
        cli.download(options.download, output_file=options.output)
    elif options.create:
        arg_dict = {
            arg.dest: getattr(options, arg.dest)
            for arg in CreateGroup._group_actions
        }
        # Drop unset options and turn the strings 'true'/'false' into bools.
        kwargs = {
            k: v
            for k, v in arg_dict.items()
            if v is not None and v not in ['true', 'false']
        }
        kwargs.update({k: True for k, v in arg_dict.items() if v == 'true'})
        kwargs.update({k: False for k, v in arg_dict.items() if v == 'false'})
        if options.file_format:
            kwargs.update({'project_format': options.file_format})
        cli.create(options.create, **kwargs)
    # commands with args
    elif options.info:
        cli.info(project_id)
    elif options.delete:
        cli.delete(project_id)
    elif options.apply:
        cli.apply(project_id, options.apply)
    elif options.template:
        arg_dict = {
            arg.dest: getattr(options, arg.dest)
            for arg in TemplateGroup._group_actions
        }
        kwargs = {
            k: v
            for k, v in arg_dict.items()
            if v is not None and v not in ['true', 'false']
        }
        kwargs.update({k: True for k, v in arg_dict.items() if v == 'true'})
        kwargs.update({k: False for k, v in arg_dict.items() if v == 'false'})
        # Run the templating export. Previously this branch only assembled
        # kwargs and never invoked the command.
        # NOTE(review): project_format is no longer added to kwargs here; it
        # looked copied from the create branch - confirm cli.templating does
        # not expect it.
        cli.templating(project_id,
                       options.template,
                       output_file=options.output,
                       **kwargs)
    elif options.export or options.output:
        cli.export(project_id,
                   output_file=options.output,
                   export_format=options.file_format)
    else:
        parser.print_help()
Example #21
0
def get_refine_ws():
    """Returns a new Google Refine workspace."""
    return refine.Refine(get_refine_server_url())
Example #22
0
# Pack the data lines (header line skipped) into records of up to 50 lines
# each: the stripped lines of a record are joined with '#####' and written
# as one output line.
# The fields are collected in a list and joined, rather than appending the
# separator and trimming it with rstrip('#####'): rstrip() removes any
# trailing '#' characters, so it also ate '#' at the end of the data and
# dropped trailing empty fields.
first, count, fields = True, 0, []
with open(DATA, 'rb') as inp:
    with open(PROCESSED, 'wb') as out:
        for line in inp:
            if first:
                first = False
                continue
            if count == 50:
                out.write('#####'.join(fields) + '\n')
                count, fields = 0, []
            fields.append(line.strip())
            count += 1
        if fields:
            out.write('#####'.join(fields) + '\n')

# Build the sample file: the header file is copied to SAMPLED, then a random
# selection of DATA lines (10% of no_of_lines) is appended with `shuf`.
sampling_ratio = 0.1
no_of_sampled_lines = int(sampling_ratio * no_of_lines)
subprocess.Popen(['cp', HEADER, SAMPLED]).communicate()
with open(SAMPLED, 'ab') as out:
    subprocess.Popen(
        ['shuf', '-n {0}'.format(no_of_sampled_lines), DATA],
        stdout=out,
    ).communicate()

# Create the sample project from the sampled file.
# Note: from here on the name `refine` refers to the client instance, not
# to the module.
server = refine.RefineServer()
refine = refine.Refine(server)
project = refine.new_project(
    project_file=SAMPLED,
    project_format=_format,
    project_options=_options)
print("Done")
print("Open: " + project.project_url())