Example #1
0
    def crawl(self, directory=None, source=None, meta=None):
        """Crawl a file or directory tree and ingest every file into a Source.

        :param directory: file or directory path; defaults to the CWD.
        :param source: label seed for the Source; defaults to ``directory``.
        :param meta: metadata dict forwarded to ``crawl_file``.
        """
        # ``meta=None`` replaces the original shared-mutable default ``{}``,
        # which would be re-used (and could be mutated) across calls.
        meta = {} if meta is None else meta
        # Default the directory before it is dereferenced; the original
        # called os.path.isfile(directory) first, crashing on None.
        directory = directory or os.getcwd()
        source = source or directory
        source = Source.create({
            'foreign_id': 'directory:%s' % slugify(source),
            'label': source
        })
        db.session.commit()
        source_id = source.id

        if os.path.isfile(directory):
            # Single-file input: ingest it and stop instead of falling
            # through to the (pointless) directory walk below.
            self.crawl_file(source_id, directory, meta)
            return

        directory = directory.encode('utf-8')
        for (dirname, dirs, files) in os.walk(directory):
            # Skip any directory whose path contains a blacklisted segment.
            skipped = [d for d in dirname.split(os.path.sep)
                       if d in SKIP_DIRECTORIES]
            if skipped:
                continue
            log.info("Descending: %r", dirname)
            # Normalise the directory name once, not once per file.
            dirname = string_value(dirname)
            for file_name in files:
                file_name = string_value(file_name)
                if file_name in SKIP_FILES:
                    continue
                file_path = os.path.join(dirname, file_name)
                self.crawl_file(source_id, file_path, meta)
Example #2
0
 def source(self):
     """Lazily create and memoize the Source record for this crawler."""
     if hasattr(self, '_source'):
         return self._source
     self._source = Source.create({
         'foreign_id': self.SOURCE_ID,
         'label': self.SOURCE_LABEL or self.SOURCE_ID
     })
     db.session.commit()
     return self._source
Example #3
0
 def crawl_source(self, engine, foreign_id, data):
     """Register a Source from ``data`` and run each configured query."""
     spec = {'foreign_id': foreign_id, 'label': data.get('label')}
     source = Source.create(spec)
     db.session.commit()
     base_meta = data.get('meta', {})
     queries = data.get('queries', {})
     for name in queries:
         self.crawl_query(engine, source, base_meta, name, queries[name])
Example #4
0
 def crawl_source(self, engine, foreign_id, data):
     """Create a Source for ``foreign_id``, then crawl all of its queries."""
     source = Source.create({
         'label': data.get('label'),
         'foreign_id': foreign_id,
     })
     db.session.commit()
     meta_base = data.get('meta', {})
     for query_name, query in data.get('queries', {}).items():
         self.crawl_query(engine, source, meta_base, query_name, query)
Example #5
0
 def source(self):
     """Return the cached Source, creating it on first access.

     The record is re-attached to the session on every call so a
     detached instance remains usable after a commit elsewhere.
     """
     try:
         cached = self._source
     except AttributeError:
         cached = self._source = Source.create({
             'foreign_id': self.SOURCE_ID,
             'label': self.SOURCE_LABEL or self.SOURCE_ID
         })
         db.session.commit()
     db.session.add(cached)
     return cached
Example #6
0
    def crawl(self):
        for base_url in SITES:
            print 'Working on base_url: {}'.format(base_url)
            self.attributes = SITES[base_url]
            self.label = self.attributes['label']
            Source.create({'label': self.label, 'foreign_id': 'blacklight'})
            db.session.commit()
            self.failed_articles = 0
            page_count = self.get_page_count(base_url)
            print "Pages: {}".format(page_count)
            page_number = 1
            while (page_number <= page_count):
                if self.failed_articles >= FAILED_LIMIT:
                    log.warning('Failure limit reach: {}'.format(FAILED_LIMIT))
                    break

                self.crawl_page(base_url, page_number, page_count)
                page_number += 1
Example #7
0
    def crawl(self):
        for base_url in SITES:
            print 'Working on base_url: {}'.format(base_url)
            self.attributes = SITES[base_url]
            self.label = self.attributes['label']
            Source.create({
                'label': self.label,
                'foreign_id': 'blacklight'
            })
            db.session.commit()
            self.failed_articles = 0
            page_count = self.get_page_count(base_url)
            print "Pages: {}".format(page_count)
            page_number = 1
            while (page_number <= page_count):
                if self.failed_articles >= FAILED_LIMIT:
                    log.warning('Failure limit reach: {}'.format(FAILED_LIMIT))
                    break

                self.crawl_page(base_url, page_number, page_count)
                page_number += 1
Example #8
0
    def crawl(self, directory=None, source=None):
        """Emit a single file or a whole directory tree as a Source.

        :param directory: path to crawl; defaults to the CWD when None.
        :param source: label seed for the Source; defaults to ``directory``.
        """
        # Default the directory before it is dereferenced; the original
        # called os.path.isfile(directory) first, crashing on None.
        directory = directory or os.getcwd()
        source = source or directory
        source = Source.create({
            'foreign_id': 'directory:%s' % slugify(source),
            'label': source
        })

        if os.path.isfile(directory):
            # Single-file input: emit it and stop instead of falling
            # through to the directory walk below.
            meta = self.metadata()
            meta.file_name = directory
            self.emit_file(source, meta, directory)
            return

        directory = directory.encode('utf-8')
        for (dirname, dirs, files) in os.walk(directory):
            log.info("Descending: %r", dirname)
            for file_name in files:
                if file_name in SKIP_FILES:
                    continue
                file_path = os.path.join(dirname, file_name)
                if not os.path.isfile(file_path):
                    continue
                try:
                    meta = self.metadata()
                    if isinstance(file_name, six.text_type):
                        meta.source_path = file_path
                    else:
                        # chardet may return {'encoding': None} for short
                        # or ambiguous byte strings.
                        enc = chardet.detect(file_name)
                        enc = enc.get('encoding')
                        try:
                            meta.source_path = file_path.decode(enc)
                        except (TypeError, LookupError,
                                UnicodeDecodeError):
                            # Narrowed from a bare ``except:``; covers
                            # enc=None, unknown codec, undecodable bytes.
                            meta.source_path = file_path.decode('ascii',
                                                                'ignore')

                    self.emit_file(source, meta, file_path)
                except Exception as ex:
                    log.exception(ex)
                    process.exception(process.INDEX, component=self.name,
                                      source_location=directory,
                                      source_id=source.id, exception=ex)
Example #9
0
    def crawl_item(self, item, source):
        """Ingest one crawled item, lazily registering its Source."""
        source_data = item.meta.get('source', {})
        source_fk = source_data.pop('foreign_id', source)
        if source_fk is None:
            raise ValueError("No foreign_id for source given: %r" % item)

        if source_fk not in self.sources:
            self.sources[source_fk] = Source.create({
                'foreign_id': source_fk,
                'label': source_data.get('label', source_fk)
            })
            if source_data.get('public'):
                # Readable (but not writable) by anonymous guests.
                Permission.grant_foreign(self.sources[source_fk],
                                         Role.SYSTEM_GUEST, True, False)
            db.session.commit()

        log.info('Import: %r', item.identifier)
        meta = self.normalize_metadata(item)
        ingest_file(self.sources[source_fk].id, meta, item.data_path,
                    move=False)
Example #10
0
    def crawl(self, directory=None, source=None):
        """Ingest a single file or walk a directory tree into a Source.

        :param directory: path to crawl; defaults to the CWD when None.
        :param source: label seed for the Source; defaults to ``directory``.
        """
        # Default the directory up-front so both the Source label and the
        # isfile() check see a real path (the original dereferenced
        # ``directory`` before defaulting, crashing on None).
        directory = directory or os.getcwd()
        source = source or directory
        source = Source.create({
            'foreign_id': 'directory:%s' % slugify(source),
            'label': source
        })

        if os.path.isfile(directory):
            # Single-file input: emit it and stop instead of falling
            # through to the directory walk.
            meta = self.metadata()
            meta.file_name = directory
            self.emit_file(source, meta, directory)
            return

        directory = directory.encode('utf-8')
        for (dirname, dirs, files) in os.walk(directory):
            log.info("Descending: %r", dirname)
            for file_name in files:
                if file_name in SKIP_FILES:
                    continue
                file_path = os.path.join(dirname, file_name)
                if not os.path.isfile(file_path):
                    continue
                try:
                    meta = self.metadata()
                    if isinstance(file_name, six.text_type):
                        meta.source_path = file_path
                    else:
                        # chardet may return {'encoding': None} for short
                        # or ambiguous byte strings.
                        enc = chardet.detect(file_name)
                        enc = enc.get('encoding')
                        try:
                            meta.source_path = file_path.decode(enc)
                        except (TypeError, LookupError,
                                UnicodeDecodeError):
                            # Narrowed from a bare ``except:``; covers
                            # enc=None, unknown codec, undecodable bytes.
                            meta.source_path = file_path.decode('ascii',
                                                                'ignore')

                    self.emit_file(source, meta, file_path)
                except Exception as ex:
                    log.exception(ex)
Example #11
0
def create():
    """Create a new Source from the request payload (login required)."""
    authz.require(authz.logged_in())
    new_source = Source.create(request_data(), current_user)
    db.session.commit()
    return view(new_source.slug)
Example #12
0
 def create_source(self, **data):
     """Create a Source, defaulting ``foreign_id`` to the crawler name."""
     data.setdefault('foreign_id', self.name)
     return Source.create(data)
Example #13
0
 def create_source(self, **data):
     """Build a Source record; ``foreign_id`` falls back to ``self.name``."""
     if 'foreign_id' not in data:
         data = dict(data, foreign_id=self.name)
     return Source.create(data)
Example #14
0
def create():
    """Admin-only endpoint: create a Source and render its view."""
    authz.require(authz.is_admin())
    source = Source.create(request_data(), current_user)
    db.session.commit()
    return view(source.slug)