Python Solr.commit Exemples, SolrAPI.Solr.commit Python Exemples

Exemple #1

0

Afficher le fichier

Fichier : file2index.py Projet : jamilatta/sandbox

def main():
    """
    Rebuild the ``nem`` Solr core from the fixed-width records in DATA_FILE.

    Every document in the core is deleted first.  Each input line then yields
    one document whose ``id`` is characters 0-11 and whose ``text`` is
    characters 536-544 of the line.  Documents are sent and committed in
    batches of 10000, the remaining partial batch is sent after the last
    line, and the index is optimized at the end.
    """
    batch_size = 10000
    solr = Solr('http://localhost:8080/solr/nem', timeout=30)

    solr.delete('*:*')
    data_list = []

    def send(batch):
        # Push one batch of documents and commit it.
        print("Sending...")
        solr.update(json.dumps(batch), headers={'Content-Type': 'text/json'})
        print("Commiting...")
        solr.commit()

    # The context manager closes the input file even if indexing fails.
    with open(DATA_FILE) as data_file:
        for count, line in enumerate(data_file):
            data_list.append({'id': line[0:12], 'text': line[536:545]})
            print(count)

            if len(data_list) == batch_size:
                send(data_list)
                data_list = []

    # The number of lines is rarely a multiple of the batch size; without
    # this final send the trailing documents would never reach the index.
    if data_list:
        send(data_list)

    solr.optimize()

Exemple #2

0

Afficher le fichier

Fichier : accesses.py Projet : jamilatta/search-journals-proc

class UpdateSearch(object):
    """
    Copy per-document access totals from the access service into Solr.
    """
    def __init__(self, collection=None, issn=None):
        """
        :param collection: optional collection acronym restricting the run
        :param issn: optional journal ISSN restricting the run
        """
        self.solr = Solr(SOLR_URL, timeout=10)
        self.issn = issn
        self.collection = collection

    def set_accesses(self, document_id, accesses):
        """
        Build the ``<add>`` XML message that stores an access total.

        :param document_id: value written to the ``id`` field
        :param accesses: total written (as text) to the ``total_access``
            field, which is flagged with ``update="set"``

        :returns: the serialized XML, encoded as UTF-8
        """
        root = ET.Element('add')
        doc = ET.SubElement(root, 'doc')

        id_field = ET.SubElement(doc, 'field')
        id_field.set('name', 'id')
        id_field.text = document_id

        access_field = ET.SubElement(doc, 'field')
        access_field.set('name', 'total_access')
        access_field.text = str(accesses)
        access_field.set('update', 'set')

        return ET.tostring(root, encoding="utf-8", method="xml")

    def run(self):
        """
        Record the access total of every already-indexed document.

        Only documents whose id is already present in Solr are updated.
        Updates are sent uncommitted; one commit and one optimize happen
        after the last document.
        """
        art_meta = ArticleMetaThriftClient()
        art_accesses = AccessThriftClient(domain="ratchet.scielo.org:11660")

        logger.info("Loading Solr available document ids")

        # Restrict the id lookup to the requested collection and/or journal.
        filters = []
        if self.collection:
            filters.append('in:%s' % self.collection)
        if self.issn:
            filters.append('issn:%s' % self.issn)
        query = ' AND '.join(filters) if filters else '*:*'

        found = json.loads(
            self.solr.select({'q': query, 'fl': 'id', 'rows': 1000000}))
        available_ids = {entry['id'] for entry in found['response']['docs']}

        logger.info("Recording accesses for documents in {0}".format(
            self.solr.url))

        documents = art_meta.documents(collection=self.collection,
                                       issn=self.issn)
        for document in documents:
            pid = document.publisher_id
            acronym = document.collection_acronym
            solr_id = '-'.join((pid, acronym))

            # Documents that are not indexed yet are left alone.
            if solr_id not in available_ids:
                continue

            logger.debug("Loading accesses for document %s" % solr_id)

            # A document without an 'access_total' entry counts as zero.
            stats = art_accesses.document(pid, acronym)
            counter = stats.get('access_total', {'value': 0})
            payload = self.set_accesses(solr_id, int(counter['value']))

            # A failed update is logged and the loop moves on.
            try:
                self.solr.update(payload, commit=False)
            except ValueError as e:
                logger.error("ValueError: {0}".format(e))
                logger.exception(e)
            except Exception as e:
                logger.error("Error: {0}".format(e))
                logger.exception(e)

        # Make the pending updates visible, then optimize the index.
        self.solr.commit()
        self.solr.optimize()

Exemple #3

0

Afficher le fichier

class UpdateSearch(object):
    """
    Command-line process that indexes Article Meta documents in Solr.

    The options are declared on the class-level ``parser`` and parsed when an
    instance is created.  ``run`` then performs exactly one of three actions:
    delete by query (``--delete``), remove index entries that Article Meta no
    longer lists (``--sanitization``), or index the documents selected by
    collection, ISSN and date range.
    """

    usage = """\
    Process to index article to SciELO Solr.

    This process collects articles in the Article meta using thrift and index
    in SciELO Solr.

    With this process it is possible to process all the article or some specific
    by collection, issn from date to until another date and a period like 7 days.
    """

    # The text must be passed as ``description``: the first positional
    # parameter of ArgumentParser is ``prog``, so passing it positionally
    # turned the whole paragraph into the program name shown in usage and
    # error messages.  The raw formatter keeps the paragraph breaks.
    parser = argparse.ArgumentParser(
        description=textwrap.dedent(usage),
        formatter_class=argparse.RawDescriptionHelpFormatter)

    parser.add_argument(
        '-p', '--period',
        type=int,
        help='index articles from specific period, use number of days.')

    parser.add_argument(
        '-f', '--from',
        dest='from_date',
        type=lambda x: datetime.strptime(x, '%Y-%m-%d'),
        nargs='?',
        help='index articles from specific date. YYYY-MM-DD.')

    # NOTE: the default is evaluated once, when the class body is executed.
    parser.add_argument(
        '-u', '--until',
        dest='until_date',
        type=lambda x: datetime.strptime(x, '%Y-%m-%d'),
        nargs='?',
        help='index articles until this specific date. YYYY-MM-DD (default today).',
        default=datetime.now())

    parser.add_argument(
        '-c', '--collection',
        dest='collection',
        default=None,
        help='use the acronym of the collection eg.: spa, scl, col.')

    parser.add_argument(
        '-i', '--issn',
        dest='issn',
        default=None,
        help='journal issn.')

    parser.add_argument(
        '-d', '--delete',
        dest='delete',
        default=None,
        help='delete query ex.: q=*:* (Lucene Syntax).')

    parser.add_argument(
        '-s', '--sanitization',
        dest='sanitization',
        default=False,
        action='store_true',
        help='Remove objects from the index that are no longer present in the database.')

    parser.add_argument(
        '-url', '--url',
        dest='solr_url',
        help='Solr RESTFul URL, processing try to get the variable from environment ``SOLR_URL`` otherwise use --url to set the url(preferable).')

    parser.add_argument(
        '-v', '--version',
        action='version',
        version='version: 0.2')

    def __init__(self):
        """
        Parse the command line and connect to Solr.

        The ``SOLR_URL`` environment variable takes precedence over ``--url``.
        When ``--period`` is given, ``from_date`` is replaced by "now minus
        that many days".

        :raises argparse.ArgumentTypeError: when no Solr URL is available
        """
        self.args = self.parser.parse_args()

        solr_url = os.environ.get('SOLR_URL') or self.args.solr_url

        if not solr_url:
            raise argparse.ArgumentTypeError(
                '--url or ``SOLR_URL`` enviroment variable must be the set, use --help.'
            )

        self.solr = Solr(solr_url, timeout=10)

        if self.args.period:
            self.args.from_date = datetime.now() - timedelta(
                days=self.args.period)

    def format_date(self, date):
        """
        Convert a datetime.datetime to a string such as ``2000-05-12``.

        :param date: built-in datetime object, or a false value

        :returns: str, or None when ``date`` is false
        """
        if not date:
            return None

        return date.strftime('%Y-%m-%d')

    def pipeline_to_xml(self, article):
        """
        Run one article through the XML pipeline and wrap the result.

        :param article: the document handed to the pipeline

        :returns: the serialized ``<add>`` element, encoded as UTF-8
        """
        stages = [
            pipeline_xml.SetupDocument(),
            pipeline_xml.DocumentID(),
            pipeline_xml.DOI(),
            pipeline_xml.Collection(),
            pipeline_xml.DocumentType(),
            pipeline_xml.URL(),
            pipeline_xml.Authors(),
            pipeline_xml.Titles(),
            pipeline_xml.OriginalTitle(),
            pipeline_xml.Pages(),
            pipeline_xml.WOKCI(),
            pipeline_xml.WOKSC(),
            pipeline_xml.JournalAbbrevTitle(),
            pipeline_xml.Languages(),
            pipeline_xml.AvailableLanguages(),
            pipeline_xml.Fulltexts(),
            pipeline_xml.PublicationDate(),
            pipeline_xml.SciELOPublicationDate(),
            pipeline_xml.SciELOProcessingDate(),
            pipeline_xml.Abstract(),
            pipeline_xml.AffiliationCountry(),
            pipeline_xml.AffiliationInstitution(),
            pipeline_xml.Sponsor(),
            pipeline_xml.Volume(),
            pipeline_xml.SupplementVolume(),
            pipeline_xml.Issue(),
            pipeline_xml.SupplementIssue(),
            pipeline_xml.ElocationPage(),
            pipeline_xml.StartPage(),
            pipeline_xml.EndPage(),
            pipeline_xml.JournalTitle(),
            pipeline_xml.IsCitable(),
            pipeline_xml.Permission(),
            pipeline_xml.Keywords(),
            pipeline_xml.JournalISSNs(),
            pipeline_xml.SubjectAreas(),
            pipeline_xml.ReceivedCitations(),
            pipeline_xml.TearDown(),
        ]

        xmls = plumber.Pipeline(*stages).run([article])

        # Add root document
        add = ET.Element('add')

        for xml in xmls:
            add.append(xml)

        return ET.tostring(add, encoding="utf-8", method="xml")

    def run(self):
        """
        Run the process selected on the command line.

        Whatever the mode, the index is committed and optimized at the end.
        """

        art_meta = ThriftClient()

        if self.args.delete:

            self.solr.delete(self.args.delete, commit=True)

        elif self.args.sanitization:

            # all ids in index
            indexed_docs = json.loads(
                self.solr.select({
                    'q': '*:*',
                    'fl': 'id',
                    'rows': 1000000
                }))['response']['docs']

            ind_ids = {entry['id'] for entry in indexed_docs}

            # all ids in articlemeta, limited to the allowed collections
            art_ids = {
                '%s-%s' % (item.code, item.collection)
                for item in art_meta.documents(only_identifiers=True)
                if item.collection in ALLOWED_COLLECTION
            }

            # Ids to remove: indexed but no longer listed by Article Meta
            remove_ids = ind_ids - art_ids

            for stale_id in remove_ids:
                self.solr.delete('id:%s' % stale_id, commit=True)

            logger.info("List of removed ids: %s" % remove_ids)

        else:

            logger.info("Indexing in {0}".format(self.solr.url))

            for document in art_meta.documents(
                    collection=self.args.collection,
                    issn=self.args.issn,
                    from_date=self.format_date(self.args.from_date),
                    until_date=self.format_date(self.args.until_date)):

                # A document that fails is logged and skipped.
                try:
                    # Build the XML once; the pipeline used to be executed
                    # twice per document and the first result thrown away.
                    xml = self.pipeline_to_xml(document)
                    self.solr.update(xml, commit=True)
                except ValueError as e:
                    logger.error("ValueError: {0}".format(e))
                    logger.exception(e)
                except Exception as e:
                    logger.error("Error: {0}".format(e))
                    logger.exception(e)

        # optimize the index
        self.solr.commit()
        self.solr.optimize()

Exemple #4

0

Afficher le fichier

class UpdateSearch(object):
    """
    Process that indexes Article Meta documents in Solr.

    Two strategies are available: ``common_mode`` re-indexes every document
    in the requested date range, while ``differential_mode`` only indexes the
    documents whose id/processing-date pair is missing from the index.
    """

    def __init__(self, period=None, from_date=None, until_date=None,
                 collection=None, issn=None, delete=False, differential=False,
                 load_indicators=False):
        """
        :param period: number of days; when given, ``from_date`` becomes
            "now minus ``period`` days"
        :param from_date: datetime marking the start of the range
        :param until_date: datetime marking the end of the range
        :param collection: collection acronym used as a filter
        :param issn: journal ISSN used as a filter
        :param delete: when exactly ``True``, index entries that Article Meta
            no longer lists are removed
        :param differential: when exactly ``True``, ``run`` uses
            ``differential_mode``
        :param load_indicators: when exactly ``True``, the ReceivedCitations
            stage is added to the XML pipeline
        """
        self.delete = delete
        self.collection = collection
        self.from_date = from_date
        self.until_date = until_date
        self.differential = differential
        self.load_indicators = load_indicators
        self.issn = issn
        self.solr = Solr(SOLR_URL, timeout=10)
        if period:
            self.from_date = datetime.now() - timedelta(days=period)

    def format_date(self, date):
        """
        Convert a datetime.datetime to a string such as ``2000-05-12``.

        :param date: built-in datetime object, or a false value

        :returns: str, or None when ``date`` is false
        """
        if not date:
            return None

        return date.strftime('%Y-%m-%d')

    def pipeline_to_xml(self, article):
        """
        Run one article through the XML pipeline and wrap the result.

        :param article: the document handed to the pipeline

        :returns: the serialized ``<add>`` element, encoded as UTF-8
        """

        pipeline_itens = [
            pipeline_xml.SetupDocument(),
            pipeline_xml.DocumentID(),
            pipeline_xml.DOI(),
            pipeline_xml.Collection(),
            pipeline_xml.DocumentType(),
            pipeline_xml.URL(),
            pipeline_xml.Authors(),
            pipeline_xml.Orcid(),
            pipeline_xml.Titles(),
            pipeline_xml.OriginalTitle(),
            pipeline_xml.Pages(),
            pipeline_xml.WOKCI(),
            pipeline_xml.WOKSC(),
            pipeline_xml.JournalAbbrevTitle(),
            pipeline_xml.Languages(),
            pipeline_xml.AvailableLanguages(),
            pipeline_xml.Fulltexts(),
            pipeline_xml.PublicationDate(),
            pipeline_xml.SciELOPublicationDate(),
            pipeline_xml.SciELOProcessingDate(),
            pipeline_xml.Abstract(),
            pipeline_xml.AffiliationCountry(),
            pipeline_xml.AffiliationInstitution(),
            pipeline_xml.Sponsor(),
            pipeline_xml.Volume(),
            pipeline_xml.SupplementVolume(),
            pipeline_xml.Issue(),
            pipeline_xml.SupplementIssue(),
            pipeline_xml.ElocationPage(),
            pipeline_xml.StartPage(),
            pipeline_xml.EndPage(),
            pipeline_xml.JournalTitle(),
            pipeline_xml.IsCitable(),
            pipeline_xml.Permission(),
            pipeline_xml.Keywords(),
            pipeline_xml.JournalISSNs(),
            pipeline_xml.SubjectAreas()
        ]

        # Citation indicators are optional; TearDown always comes last.
        if self.load_indicators is True:
            pipeline_itens.append(pipeline_xml.ReceivedCitations())

        pipeline_itens.append(pipeline_xml.TearDown())

        ppl = plumber.Pipeline(*pipeline_itens)

        xmls = ppl.run([article])

        # Add root document
        add = ET.Element('add')

        for xml in xmls:
            add.append(xml)

        return ET.tostring(add, encoding="utf-8", method="xml")

    def _index_query(self):
        """Return the Solr query matching the collection/ISSN filters."""
        itens_query = []
        if self.collection:
            itens_query.append('in:%s' % self.collection)

        if self.issn:
            itens_query.append('issn:%s' % self.issn)

        return ' AND '.join(itens_query) if itens_query else '*:*'

    def _send_to_index(self, document):
        """Send one document to Solr uncommitted; log and swallow failures."""
        try:
            xml = self.pipeline_to_xml(document)
            self.solr.update(xml, commit=False)
        except ValueError as e:
            logger.error("ValueError: {0}".format(e))
            logger.exception(e)
        except Exception as e:
            logger.error("Error: {0}".format(e))
            logger.exception(e)

    def differential_mode(self):
        """
        Index only the documents that differ between Article Meta and Solr.

        Both sides are reduced to keys of the form
        ``<id>-<processing date>``; keys known to Article Meta but absent
        from the index are (re)indexed.  When ``delete`` is ``True`` the ids
        present only in the index are removed first.
        """
        art_meta = ThriftClient()

        logger.info("Running with differential mode")
        ind_ids = set()
        art_ids = set()

        # all ids in search index
        logger.info("Loading Search Index ids.")
        list_ids = json.loads(self.solr.select(
            {'q': self._index_query(), 'fl': 'id,scielo_processing_date',
             'rows': 1000000}))['response']['docs']

        # Entries without a processing date get a sentinel date.
        for entry in list_ids:
            ind_ids.add('%s-%s' % (
                entry['id'],
                entry.get('scielo_processing_date', '1900-01-01')))

        # all ids in articlemeta
        logger.info("Loading ArticleMeta ids.")
        for item in art_meta.documents(
            collection=self.collection,
            issn=self.issn,
            only_identifiers=True
        ):
            art_ids.add('%s-%s-%s' % (item.code, item.collection, item.processing_date))

        # Ids to remove
        if self.delete is True:
            logger.info("Running remove records process.")
            # The first 27 characters of a key are compared, which drops the
            # trailing processing date.
            remove_ids = {i[:27] for i in ind_ids} - {i[:27] for i in art_ids}
            total_to_remove = len(remove_ids)
            logger.info("Removing (%d) documents from search index." % total_to_remove)
            for ndx, to_remove_id in enumerate(remove_ids, 1):
                logger.debug("Removing (%d/%d): %s" % (ndx, total_to_remove, to_remove_id))
                self.solr.delete('id:%s' % to_remove_id, commit=False)

        # Ids to include
        logger.info("Running include records process.")
        include_ids = art_ids - ind_ids
        total_to_include = len(include_ids)
        logger.info("Including (%d) documents to search index." % total_to_include)
        for ndx, to_include_id in enumerate(include_ids, 1):
            logger.debug("Including (%d/%d): %s" % (ndx, total_to_include, to_include_id))
            # Split the key by position: 23-character code, separator, then
            # 3-character collection acronym.
            code = to_include_id[:23]
            collection = to_include_id[24:27]
            document = art_meta.document(code=code, collection=collection)
            self._send_to_index(document)

    def common_mode(self):
        """
        Index every document in the configured filters and date range.

        When ``delete`` is ``True``, ids found in the index but no longer
        listed by Article Meta are removed afterwards.
        """
        art_meta = ThriftClient()

        logger.info("Running without differential mode")
        logger.info("Indexing in {0}".format(self.solr.url))
        for document in art_meta.documents(
            collection=self.collection,
            issn=self.issn,
            from_date=self.format_date(self.from_date),
            until_date=self.format_date(self.until_date)
        ):

            logger.debug("Loading document %s" % '_'.join([document.collection_acronym, document.publisher_id]))

            self._send_to_index(document)

        if self.delete is True:
            logger.info("Running remove records process.")

            list_ids = json.loads(self.solr.select(
                {'q': self._index_query(), 'fl': 'id', 'rows': 1000000}))['response']['docs']

            ind_ids = {entry['id'] for entry in list_ids}

            # all ids in articlemeta
            art_ids = {
                '%s-%s' % (item.code, item.collection)
                for item in art_meta.documents(
                    collection=self.collection,
                    issn=self.issn,
                    only_identifiers=True)
            }

            # Ids to remove.  The difference must be computed before it is
            # counted and logged; reading it first raised UnboundLocalError.
            remove_ids = ind_ids - art_ids
            total_to_remove = len(remove_ids)
            logger.info("Removing (%d) documents from search index." % total_to_remove)
            for ndx, to_remove_id in enumerate(remove_ids, 1):
                logger.debug("Removing (%d/%d): %s" % (ndx, total_to_remove, to_remove_id))
                self.solr.delete('id:%s' % to_remove_id, commit=False)

    def run(self):
        """
        Run the configured mode, then commit and optimize the index.
        """
        if self.differential is True:
            self.differential_mode()
        else:
            self.common_mode()

        # optimize the index
        self.solr.commit()
        self.solr.optimize()

Exemple #5

0

Afficher le fichier

Fichier : updatepreprint.py Projet : paratiuid/search-journals

class UpdatePreprint(object):
    """
    Command-line process that indexes pre-print records in Solr.

    Records are harvested from an OAI endpoint with Sickle, converted to
    Solr XML by the pipeline and sent to Solr one at a time.
    """

    usage = """\
    Process to index Pre-Prints articles to SciELO Solr.
    """

    # The text must be passed as ``description``: the first positional
    # parameter of ArgumentParser is ``prog``, so passing it positionally
    # turned the usage text into the program name.
    parser = argparse.ArgumentParser(
        description=textwrap.dedent(usage),
        formatter_class=argparse.RawDescriptionHelpFormatter)

    parser.add_argument(
        '-p', '--period',
        type=int,
        help='index articles from specific period, use number of hours.')

    parser.add_argument(
        '-d', '--delete',
        dest='delete',
        help='delete query ex.: q=type:"preprint (Lucene Syntax).')

    parser.add_argument(
        '-solr_url', '--solr_url',
        dest='solr_url',
        help='Solr RESTFul URL, processing try to get the variable from environment ``SOLR_URL`` otherwise use --solr_url to set the solr_url (preferable).')

    parser.add_argument(
        '-oai_url', '--oai_url',
        dest='oai_url',
        default="https://preprints.scielo.org/index.php/scielo/oai",
        help='OAI URL, processing try to get the variable from environment ``OAI_URL`` otherwise use --oai_url to set the oai_url (preferable).')

    parser.add_argument(
        '-v', '--version',
        action='version',
        version='version: 0.1-beta')

    def __init__(self):
        """
        Parse the command line and resolve the Solr and OAI endpoints.

        For both URLs the environment variable (``SOLR_URL`` / ``OAI_URL``)
        takes precedence over the command-line option.  ``from_date`` is
        "now minus ``--period`` hours", or None when no period was given.

        :raises argparse.ArgumentTypeError: when either URL is unavailable
        """
        self.args = self.parser.parse_args()

        solr_url = os.environ.get('SOLR_URL') or self.args.solr_url
        if not solr_url:
            raise argparse.ArgumentTypeError(
                '--solr_url or ``SOLR_URL`` enviroment variable must be the set, use --help.'
            )

        oai_url = os.environ.get('OAI_URL') or self.args.oai_url
        if not oai_url:
            raise argparse.ArgumentTypeError(
                '--oai_url or ``OAI_URL`` enviroment variable must be the set, use --help.'
            )

        # The environment value used to be read and validated but then
        # ignored; keep the resolved URL so ``run`` actually uses it.
        self.oai_url = oai_url
        self.solr = Solr(solr_url, timeout=10)

        # Always define the attribute: ``run`` reads it even when --period
        # is absent, which used to fail with AttributeError.
        self.from_date = None
        if self.args.period:
            self.from_date = datetime.now() - timedelta(hours=self.args.period)

    def pipeline_to_xml(self, article):
        """
        Run one record through the XML pipeline and wrap the result.

        :param article: the record handed to the pipeline

        :returns: the serialized ``<add>`` element, encoded as UTF-8
        """
        stages = [
            pipeline_xml.SetupDocument(),
            pipeline_xml.DocumentID(),
            pipeline_xml.URL(),
            pipeline_xml.DOI(),
            pipeline_xml.Languages(),
            pipeline_xml.Fulltexts(),
            pipeline_xml.PublicationDate(),
            pipeline_xml.Keywords(),
            pipeline_xml.Collection(),
            pipeline_xml.DocumentType(),
            pipeline_xml.Titles(),
            pipeline_xml.Abstract(),
            pipeline_xml.Authors(),
            pipeline_xml.TearDown(),
        ]

        xmls = plumber.Pipeline(*stages).run([article])

        # Add root document
        add = ET.Element('add')

        for xml in xmls:
            add.append(xml)

        return ET.tostring(add, encoding="utf-8", method="xml")

    def _list_records_params(self):
        """Return the ListRecords arguments; ``from`` only when a date is set."""
        params = {'metadataPrefix': 'oai_dc'}
        if self.from_date:
            # NOTE(review): from_date comes from datetime.now(), a naive
            # local time, yet is labelled "Z" here - confirm the server
            # clock is UTC.
            params['from'] = self.from_date.strftime("%Y-%m-%dT%H:%M:%SZ")
        return params

    def run(self):
        """
        Run the process for update Pre-prints in Solr.

        With ``--delete`` only the delete query is executed; otherwise the
        records are harvested and indexed.  The index is committed and
        optimized at the end in both cases.
        """

        if self.args.delete:

            self.solr.delete(self.args.delete, commit=True)

        else:

            logger.info("Indexing in {0}".format(self.solr.url))

            sickle = Sickle(self.oai_url)

            records = sickle.ListRecords(**self._list_records_params())

            # A record that fails is logged and skipped.
            for record in records:
                try:
                    xml = self.pipeline_to_xml(record.xml)
                    self.solr.update(xml, commit=True)
                except ValueError as e:
                    logger.error("ValueError: {0}".format(e))
                    logger.exception(e)
                except Exception as e:
                    logger.error("Error: {0}".format(e))
                    logger.exception(e)

        # optimize the index
        self.solr.commit()
        self.solr.optimize()

Exemple #6

0

Afficher le fichier

class UpdateSearch(object):
    """
    Process that indexes Article Meta documents in Solr.

    ``run`` either executes a delete query or indexes the selected
    documents, optionally followed by a sanitization pass that removes index
    entries Article Meta no longer lists.
    """
    def __init__(self,
                 period=None,
                 from_date=None,
                 until_date=None,
                 collection=None,
                 issn=None,
                 delete=False,
                 sanitization=False):
        """
        :param period: number of days; when given, ``from_date`` becomes
            "now minus ``period`` days"
        :param from_date: datetime marking the start of the range
        :param until_date: datetime marking the end of the range
        :param collection: collection acronym used as a filter
        :param issn: journal ISSN used as a filter
        :param delete: when true, it is passed to Solr as the delete query
            and no indexing is done
        :param sanitization: when exactly ``True``, stale index entries are
            removed
        """
        self.delete = delete
        self.sanitization = sanitization
        self.collection = collection
        self.from_date = from_date
        self.until_date = until_date
        self.issn = issn
        self.solr = Solr(SOLR_URL, timeout=10)
        if period:
            self.from_date = datetime.now() - timedelta(days=period)

    def format_date(self, date):
        """
        Convert a datetime.datetime to a string such as ``2000-05-12``.

        :param date: built-in datetime object, or a false value

        :returns: str, or None when ``date`` is false
        """
        if not date:
            return None

        return date.strftime('%Y-%m-%d')

    def pipeline_to_xml(self, article):
        """
        Run one article through the XML pipeline and wrap the result.

        :param article: the document handed to the pipeline

        :returns: the serialized ``<add>`` element, encoded as UTF-8
        """
        stages = [
            pipeline_xml.SetupDocument(),
            pipeline_xml.DocumentID(),
            pipeline_xml.DOI(),
            pipeline_xml.Collection(),
            pipeline_xml.DocumentType(),
            pipeline_xml.URL(),
            pipeline_xml.Authors(),
            pipeline_xml.Titles(),
            pipeline_xml.OriginalTitle(),
            pipeline_xml.Pages(),
            pipeline_xml.WOKCI(),
            pipeline_xml.WOKSC(),
            pipeline_xml.JournalAbbrevTitle(),
            pipeline_xml.Languages(),
            pipeline_xml.AvailableLanguages(),
            pipeline_xml.Fulltexts(),
            pipeline_xml.PublicationDate(),
            pipeline_xml.SciELOPublicationDate(),
            pipeline_xml.SciELOProcessingDate(),
            pipeline_xml.Abstract(),
            pipeline_xml.AffiliationCountry(),
            pipeline_xml.AffiliationInstitution(),
            pipeline_xml.Sponsor(),
            pipeline_xml.Volume(),
            pipeline_xml.SupplementVolume(),
            pipeline_xml.Issue(),
            pipeline_xml.SupplementIssue(),
            pipeline_xml.ElocationPage(),
            pipeline_xml.StartPage(),
            pipeline_xml.EndPage(),
            pipeline_xml.JournalTitle(),
            pipeline_xml.IsCitable(),
            pipeline_xml.Permission(),
            pipeline_xml.Keywords(),
            pipeline_xml.JournalISSNs(),
            pipeline_xml.SubjectAreas(),
            pipeline_xml.ReceivedCitations(),
            pipeline_xml.TearDown(),
        ]

        xmls = plumber.Pipeline(*stages).run([article])

        # Add root document
        add = ET.Element('add')

        for xml in xmls:
            add.append(xml)

        return ET.tostring(add, encoding="utf-8", method="xml")

    def run(self):
        """
        Run the process for update article in Solr.

        The index is committed and optimized at the end in every case.
        """

        art_meta = ThriftClient()

        if self.delete:

            self.solr.delete(self.delete, commit=True)
        else:

            logger.info("Indexing in {0}".format(self.solr.url))
            for document in art_meta.documents(
                    collection=self.collection,
                    issn=self.issn,
                    from_date=self.format_date(self.from_date),
                    until_date=self.format_date(self.until_date)):

                logger.debug("Loading document %s" % '_'.join(
                    [document.collection_acronym, document.publisher_id]))

                # A document that fails is logged and skipped.
                try:
                    # Build the XML once; the pipeline used to be executed
                    # twice per document and the first result thrown away.
                    xml = self.pipeline_to_xml(document)
                    self.solr.update(xml, commit=True)
                except ValueError as e:
                    logger.error("ValueError: {0}".format(e))
                    logger.exception(e)
                except Exception as e:
                    logger.error("Error: {0}".format(e))
                    logger.exception(e)

        if self.sanitization is True:
            logger.info("Running sanitization process")

            # Restrict the comparison to the requested collection/journal.
            itens_query = []
            if self.collection:
                itens_query.append('in:%s' % self.collection)

            if self.issn:
                itens_query.append('issn:%s' % self.issn)

            query = ' AND '.join(itens_query) if itens_query else '*:*'

            list_ids = json.loads(
                self.solr.select({
                    'q': query,
                    'fl': 'id',
                    'rows': 1000000
                }))['response']['docs']

            ind_ids = {entry['id'] for entry in list_ids}

            # all ids in articlemeta
            art_ids = {
                '%s-%s' % (item.code, item.collection)
                for item in art_meta.documents(collection=self.collection,
                                               issn=self.issn,
                                               only_identifiers=True)
            }

            # Ids to remove: indexed but no longer listed by Article Meta
            remove_ids = ind_ids - art_ids
            for stale_id in remove_ids:
                logger.debug("Removing id: %s" % stale_id)
                self.solr.delete('id:%s' % stale_id, commit=True)

        # optimize the index
        self.solr.commit()
        self.solr.optimize()

Exemple #7

0

Afficher le fichier

Fichier : updatesearch.py Projet : deandr/search-journals

class UpdateSearch(object):
    """
    Command-line process that indexes Article Meta documents in Solr.

    Depending on the parsed arguments, ``run`` does exactly one of:

    * delete documents matching a query (``--delete``);
    * remove index entries whose id is no longer listed by Article Meta
      (``--sanitization``);
    * index the documents selected by collection, ISSN and date range.

    ``run`` always finishes by calling ``commit`` and ``optimize`` on the
    Solr client.
    """

    usage = """\
    Process to index article to SciELO Solr.

    This process collects articles in the Article meta using thrift and index
    in SciELO Solr.

    With this process it is possible to process all the article or some specific
    by collection, issn from date to until another date and a period like 7 days.
    """

    # The first positional parameter of ArgumentParser is ``prog``; the text
    # must be passed as ``description`` so that it does not replace the
    # program name in help and error messages.
    parser = argparse.ArgumentParser(description=textwrap.dedent(usage))

    parser.add_argument('-p', '--period',
                        type=int,
                        help='index articles from specific period, use number of days.')

    parser.add_argument('-f', '--from',
                        dest='from_date',
                        type=lambda x: datetime.strptime(x, '%Y-%m-%d'),
                        nargs='?',
                        help='index articles from specific date. YYYY-MM-DD.')

    # NOTE: the default is evaluated once, when the class body is executed,
    # not each time the arguments are parsed.
    parser.add_argument('-u', '--until',
                        dest='until_date',
                        type=lambda x: datetime.strptime(x, '%Y-%m-%d'),
                        nargs='?',
                        help='index articles until this specific date. YYYY-MM-DD (default today).',
                        default=datetime.now())

    parser.add_argument('-c', '--collection',
                        dest='collection',
                        default=None,
                        help='use the acronym of the collection eg.: spa, scl, col.')

    parser.add_argument('-i', '--issn',
                        dest='issn',
                        default=None,
                        help='journal issn.')

    parser.add_argument('-d', '--delete',
                        dest='delete',
                        default=None,
                        help='delete query ex.: q=*:* (Lucene Syntax).')

    parser.add_argument('-s', '--sanitization',
                        dest='sanitization',
                        default=False,
                        action='store_true',
                        help='Remove objects from the index that are no longer present in the database.')

    parser.add_argument('-url', '--url',
                        dest='solr_url',
                        help='Solr RESTFul URL, processing try to get the variable from environment ``SOLR_URL`` otherwise use --url to set the url(preferable).')

    parser.add_argument('-v', '--version',
                        action='version',
                        version='version: 0.2')

    def __init__(self):
        """
        Parse the command line and create the Solr client.

        The Solr URL is taken from the ``SOLR_URL`` environment variable and,
        when that is unset or empty, from the ``--url`` argument.  When
        ``--period`` is given, ``from_date`` is replaced by "now minus that
        many days".

        :raises argparse.ArgumentTypeError: if neither ``SOLR_URL`` nor
            ``--url`` provides a URL.
        """

        self.args = self.parser.parse_args()

        # The environment variable takes precedence over the --url argument.
        solr_url = os.environ.get('SOLR_URL') or self.args.solr_url

        if not solr_url:
            raise argparse.ArgumentTypeError('--url or ``SOLR_URL`` enviroment variable must be the set, use --help.')

        self.solr = Solr(solr_url, timeout=10)

        if self.args.period:
            self.args.from_date = datetime.now() - timedelta(days=self.args.period)

    def format_date(self, date):
        """
        Convert a date to a ``YYYY-MM-DD`` string, e.g. ``2000-05-12``.

        :param date: object providing ``strftime`` (such as
            ``datetime.datetime``), or a falsy value.

        :returns: the formatted str, or ``None`` when ``date`` is falsy.
        """
        if not date:
            return None

        return date.strftime('%Y-%m-%d')

    def pipeline_to_xml(self, article):
        """
        Run one article through the XML pipeline and wrap the result in an
        ``<add>`` root element.

        :param article: the document handed to the pipeline (as yielded by
            the Article Meta client in ``run``).

        :returns: the value of ``ET.tostring`` for the ``<add>`` element,
            encoded as utf-8.
        """

        ppl = plumber.Pipeline(
            pipeline_xml.SetupDocument(),
            pipeline_xml.DocumentID(),
            pipeline_xml.DOI(),
            pipeline_xml.Collection(),
            pipeline_xml.DocumentType(),
            pipeline_xml.URL(),
            pipeline_xml.Authors(),
            pipeline_xml.Titles(),
            pipeline_xml.OriginalTitle(),
            pipeline_xml.Pages(),
            pipeline_xml.WOKCI(),
            pipeline_xml.WOKSC(),
            pipeline_xml.JournalAbbrevTitle(),
            pipeline_xml.Languages(),
            pipeline_xml.AvailableLanguages(),
            pipeline_xml.Fulltexts(),
            pipeline_xml.PublicationDate(),
            pipeline_xml.SciELOPublicationDate(),
            pipeline_xml.SciELOProcessingDate(),
            pipeline_xml.Abstract(),
            pipeline_xml.AffiliationCountry(),
            pipeline_xml.AffiliationInstitution(),
            pipeline_xml.Sponsor(),
            pipeline_xml.Volume(),
            pipeline_xml.SupplementVolume(),
            pipeline_xml.Issue(),
            pipeline_xml.SupplementIssue(),
            pipeline_xml.ElocationPage(),
            pipeline_xml.StartPage(),
            pipeline_xml.EndPage(),
            pipeline_xml.JournalTitle(),
            pipeline_xml.IsCitable(),
            pipeline_xml.Permission(),
            pipeline_xml.Keywords(),
            pipeline_xml.JournalISSNs(),
            pipeline_xml.SubjectAreas(),
            pipeline_xml.ReceivedCitations(),
            pipeline_xml.TearDown()
        )

        xmls = ppl.run([article])

        # Add root document
        add = ET.Element('add')

        for xml in xmls:
            add.append(xml)

        return ET.tostring(add, encoding="utf-8", method="xml")

    def run(self):
        """
        Run the process for update article in Solr.

        Exactly one branch is executed, chosen from the parsed arguments:
        delete by query, sanitization, or indexing.  Afterwards the index is
        committed and optimized.
        """

        art_meta = ThriftClient()

        if self.args.delete:

            self.solr.delete(self.args.delete, commit=True)

        elif self.args.sanitization:

            # all ids in index
            response = json.loads(self.solr.select(
                {'q': '*:*', 'fl': 'id', 'rows': 1000000}))
            ind_ids = set(doc['id'] for doc in response['response']['docs'])

            # all ids in articlemeta, restricted to the allowed collections
            art_ids = set(
                '%s-%s' % (item.code, item.collection)
                for item in art_meta.documents(only_identifiers=True)
                if item.collection in ALLOWED_COLLECTION
            )

            # Ids to remove: indexed but no longer listed by articlemeta
            remove_ids = ind_ids - art_ids

            for doc_id in remove_ids:
                self.solr.delete('id:%s' % doc_id, commit=True)

            logger.info("List of removed ids: %s" % remove_ids)

        else:

            logger.info("Indexing in {0}".format(self.solr.url))

            for document in art_meta.documents(
                collection=self.args.collection,
                issn=self.args.issn,
                from_date=self.format_date(self.args.from_date),
                until_date=self.format_date(self.args.until_date)
            ):

                # Best effort: a document that fails to convert or to be sent
                # is logged and skipped so the remaining ones are still
                # indexed.
                try:
                    # Build the XML once and send that same payload.
                    xml = self.pipeline_to_xml(document)
                    self.solr.update(xml, commit=True)
                except ValueError as e:
                    logger.error("ValueError: {0}".format(e))
                    logger.exception(e)
                except Exception as e:
                    logger.error("Error: {0}".format(e))
                    logger.exception(e)

        # optimize the index
        self.solr.commit()
        self.solr.optimize()