def extract_affiliations(project_id, modified_only=False):
    """
    Formats the raw output from Refine into an ADS-readable file.

    This is a generator. For every row of the project's TSV export, and for
    every "bibcode,position" pair listed in the row's last column, it yields
    one string made of the bibcode, the position and the rebuilt new
    affiliation, separated by tab characters.

    Args:
        project_id: Id of the Refine project to read from SERVER.
        modified_only: When true, rows whose rebuilt original and new
            affiliations are equal are skipped.

    Raises:
        Exception: If the project's columns are not exactly the five
            expected ones, in the expected order.
    """
    p = refine.RefineProject(SERVER, project_id=project_id)

    # Check the columns.
    expected_columns = [
        'Original affiliation',
        'New affiliation',
        'Original emails',
        'New emails',
        'Bibcodes and positions',
    ]
    if p.columns != expected_columns:
        raise Exception('ERROR: Columns are not as expected.')

    rows = p.export(export_format='tsv')
    # Skip the first row that contains the column names. The built-in next()
    # with a default is used instead of rows.next(): it does not depend on the
    # Python 2-only method name, and an empty export no longer raises
    # StopIteration from inside this generator.
    next(rows, None)
    for row in rows:
        # row[:-1] drops the final character of the line.
        # NOTE(review): presumably the trailing newline -- confirm that the
        # export always terminates the last row with one.
        row = UNICODE_HANDLER.u2ent(row[:-1].decode('utf_8'))
        original_aff, new_aff, original_emails, new_emails, bibcodes = row.split('\t')

        original = rebuild_affiliation(original_aff, original_emails)
        new = rebuild_affiliation(new_aff, new_emails)

        if modified_only and original == new:
            continue

        # The last column holds space-separated "bibcode,position" pairs.
        for entry in bibcodes.split(' '):
            bibcode, position = entry.split(',', 1)
            yield '%s\t%s\t%s' % (bibcode, position, new)
Ejemplo n.º 2
0
 def test_open_export(self):
     """Export the test project and check the header and every data line.

     The first exported line must contain 'email'; every following line
     must contain an 'M' or an 'F'.
     """
     fp = refine.RefineProject(self.project.project_url()).export()
     try:
         # Built-in next() instead of the Python 2-only .next() method.
         header = next(fp)
         self.assertTrue('email' in header)
         for line in fp:
             self.assertTrue('M' in line or 'F' in line)
     finally:
         # Close the export handle even when an assertion above fails.
         fp.close()
Ejemplo n.º 3
0
def export_csv_to_rdf(proj_name, input_file, encoding, json_file, output_file):
    """Create an OpenRefine project from a CSV upload and export it as RDF.

    The project is created through the 'create-project-from-upload' command,
    its id is read from the 'project' query parameter of the response URL,
    the operations in ``json_file`` are applied and the result is exported
    to ``output_file``.

    Raises:
        Exception: If the response URL carries no 'project' parameter.
    """
    server = refine.RefineServer()
    LOGGER.debug("Connected to OpenRefine")

    options_json = get_options(proj_name, input_file)
    # Only the encoding is sent in the JSON-encoded 'options' parameter.
    params = {'options': json.dumps({'encoding': encoding})}
    resp = server.urlopen('create-project-from-upload', options_json, params)
    query_string = urlparse.urlparse(resp.geturl()).query
    url_params = urlparse.parse_qs(query_string)

    if 'project' not in url_params:
        raise Exception('Project not created')

    project_id = url_params['project'][0]
    LOGGER.debug("Created project with project id: %s", project_id)
    proj = refine.RefineProject(project_id)
    update_project_file(project_id)

    apply_operations(proj, json_file)
    export_project(proj, output_file)
    if output_file:
        LOGGER.debug("RDF exported to: %s", output_file)
Ejemplo n.º 4
0
 def test_open_export_csv(self):
     """Export the test project and check it row by row as tab-separated data.

     The first column of the header row must be 'email'; the fourth column
     of every following row must be 'F' or 'M'.
     """
     fp = refine.RefineProject(self.project.project_url()).export()
     try:
         csv_fp = csv.reader(fp, dialect='excel-tab')
         # Built-in next() instead of the Python 2-only .next() method.
         header = next(csv_fp)
         self.assertTrue(header[0] == 'email')
         for row in csv_fp:
             self.assertTrue(row[3] == 'F' or row[3] == 'M')
     finally:
         # Close the export handle even when an assertion above fails.
         fp.close()
Ejemplo n.º 5
0
def delete(project_id):
    """Delete the project with the given id and report success on stdout.

    Raises:
        Exception: If the delete call returns anything not equal to True.
    """
    outcome = refine.RefineProject(project_id).delete()
    if outcome != True:
        raise Exception('Failed to delete %s: %s' % (project_id, outcome))
    print('Project %s has been successfully deleted' % project_id)
Ejemplo n.º 6
0
def delete(project_id):
    """Delete the project with the given id and report success on stdout.

    Raises:
        Exception: If the delete call returns anything not equal to True.
    """
    outcome = refine.RefineProject(project_id).delete()
    if outcome != True:
        raise Exception('Failed to delete {}: {}'.format(project_id, outcome))
    print('Project {} has been successfully deleted'.format(project_id))
Ejemplo n.º 7
0
def apply(project_id, history_file):
    """Apply the OpenRefine history stored in a JSON file to a project.

    Prints a confirmation on success.

    Raises:
        Exception: If the apply call returns anything other than 'ok'.
    """
    outcome = refine.RefineProject(project_id).apply_operations(history_file)
    if outcome != 'ok':
        raise Exception('Failed to apply %s to %s: %s' %
                        (history_file, project_id, outcome))
    print('File %s has been successfully applied to project %s' %
          (history_file, project_id))
Ejemplo n.º 8
0
 def deleteOR(refineproj=None):
     """Delete an OpenRefine project and clear the stored project number.

     When ``refineproj`` is falsy, the project is looked up on the configured
     OPENREFINE_SERVER using ``self.openrefine_projectnumber``; otherwise
     ``refineproj`` itself is deleted. Always returns True.
     """
     # NOTE(review): `self` is used below but is not a parameter of this
     # function -- presumably it comes from an enclosing scope that is not
     # visible here; confirm.
     server_url = current_app.config.get("OPENREFINE_SERVER")
     if refineproj:
         refiner = refineproj
     else:
         project_number = int(self.openrefine_projectnumber)
         refiner = refine.RefineProject(server=server_url,
                                        project_id=project_number)
     refiner.delete()
     self.openrefine_projectnumber = ""
     self.save()
     return True
Ejemplo n.º 9
0
def update_project_file(project_id):
    """Record ``project_id`` in LAST_PROJECT_FILE, deleting the previous project.

    If LAST_PROJECT_FILE already exists, its first line is read as the id of
    the previously created project and that project is deleted. The file is
    then (re)written with ``project_id`` followed by a newline.
    """
    if os.path.exists(LAST_PROJECT_FILE):
        # `with` guarantees the handle is closed even if reading fails.
        with open(LAST_PROJECT_FILE, 'r') as last_file:
            last_project_created = last_file.readline().rstrip()
        LOGGER.debug("Deleting project id: %s", last_project_created)
        refine.RefineProject(last_project_created).delete()

    # Single write path for both the "first run" and the "replace" case.
    with open(LAST_PROJECT_FILE, 'w') as last_file:
        last_file.write("%s\n" % project_id)
Ejemplo n.º 10
0
    def __init__(self, source=None):
        """Bind this object to the OpenRefine project that belongs to ``source``.

        Connects to the server configured as OPENREFINE_SERVER. If
        ``source.ORid`` is falsy a project is created through
        ``self.createOR(source)``; otherwise the existing project with that
        id is opened.
        """
        # NOTE(review): `source` defaults to None but `source.ORid` is read
        # unconditionally below -- callers presumably always pass a source.
        server_url = current_app.config.get("OPENREFINE_SERVER")
        self.refine_server = refine.Refine(server=server_url)
        self.source = source

        if source.ORid:
            # Normalise the stored id through int() before passing it on as
            # a string.
            existing_id = str(int(self.source.ORid))
            self.refineproj = refine.RefineProject(server=server_url,
                                                   project_id=existing_id)
        else:
            self.refineproj = self.createOR(source)
Ejemplo n.º 11
0
def info(project_id):
    """Print the metadata and the column names of a project.

    Prints the id, the project URL, every non-empty metadata entry and a
    numbered list of columns. If the id is not among the server's projects,
    an error message is printed instead.
    """
    projects = refine.Refine(refine.RefineServer()).list_projects()
    if project_id not in projects:
        print('Error: No project found with id %s.\n'
              'Check existing projects with command --list' % (project_id))
        return

    project_url = ('http://' + refine.REFINE_HOST + ':' + refine.REFINE_PORT +
                   '/project?project=' + project_id)
    print('{0:>20}: {1}'.format('id', project_id))
    print('{0:>20}: {1}'.format('url', project_url))

    # Only metadata entries with a truthy value are shown.
    for key, value in projects[project_id].items():
        if value:
            print(u'{0:>20}: {1}'.format(key, value))

    models = refine.RefineProject(project_id).get_models()
    column_names = [col['name'] for col in models['columnModel']['columns']]
    for number, column_name in enumerate(column_names, start=1):
        label = u'column ' + str(number).zfill(3)
        print(u'{0:>20}: {1}'.format(label, column_name))
Ejemplo n.º 12
0
def export(project_id, encoding=None, output_file=None, export_format=None):
    """Dump a project to stdout or to a file.

    The format defaults to 'tsv'. When writing to a file, a non-empty file
    extension overrides the requested format. For 'csv', 'tsv' and 'txt' the
    encoding is forced to 'UTF-8'.
    """
    project = refine.RefineProject(project_id)
    fmt = export_format or 'tsv'

    if output_file:
        extension = os.path.splitext(output_file)[1][1:]
        if extension:
            fmt = extension.lower()

    if fmt in ['csv', 'tsv', 'txt']:
        encoding = 'UTF-8'

    if not output_file:
        exported = project.export(export_format=fmt, encoding=encoding)
        sys.stdout.write(exported.read())
        return

    # The target file is opened before the export is requested.
    with open(output_file, 'wb') as out:
        exported = project.export(export_format=fmt, encoding=encoding)
        out.write(exported.read())
    print('Export to file %s complete' % output_file)
Ejemplo n.º 13
0
def main():
    """Parse the command line and run the requested project actions.

    Returns the opened project when a positional argument was given,
    otherwise None.
    """
    options, args = PARSER.parse_args()

    # Command-line host/port override the library defaults.
    if options.host:
        refine.REFINE_HOST = options.host
    if options.port:
        refine.REFINE_PORT = options.port

    # Usage is shown unless listing was requested or exactly one positional
    # argument was given; processing continues either way.
    if not (options.list or len(args) == 1):
        PARSER.print_usage()
    if options.list:
        list_projects()
    if not args:
        return None

    project = refine.RefineProject(args[0])
    if options.apply:
        outcome = project.apply_operations(options.apply)
        if outcome != 'ok':
            print('Failed to apply %s: %s' % (options.apply, outcome),
                  file=sys.stderr)
    if options.export:
        export_project(project, options)

    return project
Ejemplo n.º 14
0
def getToValue(computeCluster):
    """Return, for each cluster, the 'value' of its highest-'count' entry.

    ``computeCluster`` is an iterable of clusters, each a non-empty sequence
    of dicts with 'value' and 'count' keys. When counts tie, the first entry
    with the highest count wins.
    """
    return [
        max(cluster, key=lambda entry: entry['count'])['value']
        for cluster in computeCluster
    ]

# @begin RenameColumn @desc Rename column name to make original table more meaningful
# @in projectID
# @in oldColumnName
# @in newColumnName
# @out table1
# Renames the 'notes' column to 'commands' in the project identified by
# projectID (defined outside this excerpt).
# NOTE(review): the annotations above declare oldColumnName/newColumnName as
# inputs, but the call below hard-codes both names.
refine.RefineProject(refine.RefineServer(),projectID).rename_column('notes','commands')
# @end RenameColumn


# @begin OperationsColSponsor @desc OpenRefine operations on column sponsor
# @in table1
# @in projectID
# @in columnName:"sponsor"
# @in expression:"value.trim()"
# @in expression:"value.toLowercase()"
# @in clusterer_type:"binning"
# @in function:"ngram-fingerprint"
# @in function:getToValue
# @in function:getFromValue
# @in params:"20"
# @out table1-Sponsor
Ejemplo n.º 15
0
def main():
    """Command line interface.

    Picks up the OpenRefine host/port from docker link environment
    variables, parses the command line, resolves a project name given as
    the first positional argument to its id, then runs the requested
    commands.

    Returns:
        The opened project after an apply, export/output or info command;
        otherwise None.

    Raises:
        Exception: If a project name matches more than one project, or no
            project at all.
    """

    # get environment variables in docker network
    docker_host = os.environ.get('OPENREFINE_SERVER_PORT_3333_TCP_ADDR')
    if docker_host:
        os.environ["OPENREFINE_HOST"] = docker_host
        refine.REFINE_HOST = docker_host
    docker_port = os.environ.get('OPENREFINE_SERVER_PORT_3333_TCP_PORT')
    if docker_port:
        os.environ["OPENREFINE_PORT"] = docker_port
        refine.REFINE_PORT = docker_port

    options, args = PARSER.parse_args()

    # Collect the values of all command options (groups 2 and 3) that were
    # actually set; with none set there is nothing to do but show usage.
    commands_dict = {
        group2_arg.dest: getattr(options, group2_arg.dest)
        for group2_arg in group2.option_list
    }
    commands_dict.update({
        group3_arg.dest: getattr(options, group3_arg.dest)
        for group3_arg in group3.option_list
    })
    commands_dict = {k: v for k, v in commands_dict.items() if v is not None}
    if not commands_dict:
        PARSER.print_usage()
        return

    # Explicit command-line host/port win over the docker environment.
    if options.host:
        refine.REFINE_HOST = options.host
    if options.port:
        refine.REFINE_PORT = options.port

    # A non-numeric first argument is treated as a project name and is
    # replaced in place by the id of the single project carrying that name.
    if args and not str.isdigit(args[0]):
        projects = refine.Refine(refine.RefineServer()).list_projects().items()
        idlist = [
            str(project_id)
            for project_id, project_info in projects
            if args[0] == project_info['name']
        ]
        if len(idlist) > 1:
            raise Exception(
                'Found at least two projects. Please specify project by id.')
        if not idlist:
            # Previously this case surfaced as a bare IndexError on idlist[0].
            raise Exception(
                'No project found with name %s. '
                'Please specify project by id.' % args[0])
        args[0] = idlist[0]

    if options.list:
        list_projects()
    if options.create:
        create_project(options)
    if options.delete:
        project = refine.RefineProject(args[0])
        project.delete()
    if options.apply:
        project = refine.RefineProject(args[0])
        response = project.apply_operations(options.apply)
        if response != 'ok':
            # sys.stderr.write instead of the Python 2-only `print >>` form,
            # which does not write to stderr under Python 3.
            sys.stderr.write('Failed to apply %s: %s\n'
                             % (options.apply, response))
        return project
    if options.export or options.output:
        project = refine.RefineProject(args[0])
        export_project(project, options)
        return project
    if options.info:
        info(args[0])
        project = refine.RefineProject(args[0])
        return project
Ejemplo n.º 16
0
def templating(project_id,
               template,
               encoding='UTF-8',
               output_file=None,
               mode=None,
               prefix='',
               rowSeparator='\n',
               suffix='',
               filterQuery=None,
               filterColumn=None,
               facets=None,
               splitToFiles=False,
               suffixById=None):
    """Dump a project to stdout or file with templating.

    Args:
        project_id: Id of the project to export.
        template: Template applied to every row/record.
        encoding: Encoding passed to the templating export.
        output_file: Target file. Without splitToFiles, a falsy value sends
            the output to stdout. With splitToFiles, it supplies the base
            name and extension of the generated files and defaults to
            today's date (YYYYMMDD).
        mode: 'record-based' selects record mode; anything else selects
            row mode.
        prefix: Text emitted before the rows (before each record when
            splitting).
        rowSeparator: Text emitted between rows (not used when splitting).
        suffix: Text emitted after the rows (after each record when
            splitting).
        filterQuery: Optional case-insensitive regex text filter.
        filterColumn: Column the filter applies to; defaults to the
            project's key column.
        facets: Optional JSON string describing one facet to add to the
            engine configuration.
        splitToFiles: When true, every row/record is written to its own
            file.
        suffixById: When true (split mode only), files are named after the
            value of the first column instead of a running number.
    """
    project = refine.RefineProject(project_id)

    # basic config
    templateconfig = {
        'prefix': prefix,
        'suffix': suffix,
        'template': template,
        'rowSeparator': rowSeparator,
        'encoding': encoding
    }

    # construct the engine config
    if mode == 'record-based':
        engine = {'facets': [], 'mode': 'record-based'}
    else:
        engine = {'facets': [], 'mode': 'row-based'}
    if facets:
        engine['facets'].append(json.loads(facets))
    if filterQuery:
        if not filterColumn:
            filterColumn = project.get_models()['columnModel']['keyColumnName']
        textFilter = {
            'type': 'text',
            'name': filterColumn,
            'columnName': filterColumn,
            'mode': 'regex',
            'caseSensitive': False,
            'query': filterQuery
        }
        engine['facets'].append(textFilter)
    templateconfig.update({'engine': json.dumps(engine)})

    if not splitToFiles:
        # normal output
        if not output_file:
            sys.stdout.write(
                project.export_templating(**templateconfig).read())
        else:
            with open(output_file, 'wb') as f:
                f.write(project.export_templating(**templateconfig).read())
            print('Export to file %s complete' % output_file)
    else:
        # splitToFiles functionality
        prefix = templateconfig['prefix']
        suffix = templateconfig['suffix']
        # Marker inserted in front of every row/record so that the single
        # export can be cut into pieces afterwards.
        split = '===|||THISISTHEBEGINNINGOFANEWRECORD|||==='
        if not output_file:
            output_file = time.strftime('%Y%m%d')
        # Derive base and extension unconditionally. They used to be set only
        # when output_file was given, so the default file name failed with an
        # unbound-variable error on the extension check.
        base, ext = os.path.splitext(output_file)
        ext = ext[1:] or 'txt'
        # generate config for subfeature suffixById
        if suffixById:
            # One line per row holding the first column's value; blank lines
            # are dropped.
            ids_template = ('{{forNonBlank(' +
                            'with(row.columnNames[0],cn,cells[cn].value),' +
                            'v,v,"")}}')
            ids_templateconfig = {
                'engine': json.dumps(engine),
                'template': ids_template,
                'rowSeparator': '\n',
                'encoding': encoding
            }
            ids = [
                line.rstrip('\n')
                for line in project.export_templating(**ids_templateconfig)
                if line.rstrip('\n')
            ]
        # generate common config
        if mode == 'record-based':
            # record-based: split-character into template
            #               if key column is not blank (=record)
            template = ('{{forNonBlank(' +
                        'with(row.columnNames[0],cn,cells[cn].value),' +
                        'v,"' + split + '", "")}}' +
                        templateconfig['template'])
        else:
            # row-based: split-character into template
            template = split + templateconfig['template']
        # prefix/suffix are written per file below, so they are blanked in
        # the export itself.
        templateconfig.update({
            'prefix': '',
            'suffix': '',
            'template': template,
            'rowSeparator': ''
        })
        # execute
        records = project.export_templating(
            **templateconfig).read().split(split)
        del records[0]  # skip first blank entry
        if suffixById:
            # NOTE(review): assumes ids and records line up one-to-one --
            # confirm for rows whose first column is blank.
            for index, record in enumerate(records):
                output_file = base + '_' + ids[index] + '.' + ext
                with open(output_file, 'wb') as f:
                    f.writelines([prefix, record, suffix])
            print('Export to files complete. Last file: %s' % output_file)
        else:
            # Zero-pad the running number to the width of the record count.
            zeros = len(str(len(records)))
            for index, record in enumerate(records):
                output_file = base + '_' + \
                    str(index + 1).zfill(zeros) + '.' + ext
                with open(output_file, 'wb') as f:
                    f.writelines([prefix, record, suffix])
            print('Export to files complete. Last file: %s' % output_file)