Python read_uri Examples, spectrometer.processor.utils.read_uri Python Examples

Example #1

0

Show file

File: main.py Project: dave-tucker/spectrometer

def _read_official_programs_yaml(program_list_uri, release_names):
    """Build module groups from the programs YAML found at a URI.

    The document is fetched with ``utils.read_uri`` and parsed with
    ``yaml.safe_load``. Each top-level entry is treated as a program whose
    ``projects`` list holds dicts with a ``repo`` key of the form
    ``<owner>/<module>``.

    :param program_list_uri: location of the YAML document
    :param release_names: iterable of release names; they are lower-cased
        before being compared with the ``*-since`` values of a project
    :returns: a ``defaultdict`` keyed by group id. Every group dict has
        ``modules``, ``releases`` and ``id``; program groups and the four
        pre-registered project-type groups also carry ``tag`` and
        ``module_group_name``.
    """
    LOG.debug('Process list of programs from uri: %s', program_list_uri)
    programs = yaml.safe_load(utils.read_uri(program_list_uri))

    def _empty_group():
        return {'modules': [], 'releases': collections.defaultdict(list)}

    module_groups = collections.defaultdict(_empty_group)

    # Pre-register the project-type groups so they exist even when no
    # project ends up in them.
    for type_id in ('official-bootstrap', 'official-incubation',
                    'official-mature', 'official-core'):
        type_group = module_groups[type_id]
        type_group['tag'] = 'project_type'
        type_group['module_group_name'] = type_id

    # Ordered: when several tags name the same release, the first wins.
    since_tag_to_type = (
        ('bootstrapped-since', 'official-bootstrap'),
        ('incubated-since', 'official-incubation'),
        ('mature-since', 'official-mature'),
        ('core-since', 'official-core'),
    )

    for program_name, info in six.iteritems(programs):
        group_id = program_name.lower()
        display_name = program_name
        if 'codename' in info:
            display_name = '%s (%s)' % (info['codename'], program_name)
            group_id = '%s-group' % info['codename'].lower()

        program_group = module_groups[group_id]
        program_group['module_group_name'] = display_name
        program_group['tag'] = 'program'

        for module in info['projects']:
            module_name = module['repo'].split('/')[1]
            module_groups[group_id]['modules'].append(module_name)

            if not any(tag in module for tag, _ in since_tag_to_type):
                # No lifecycle information at all for this project.
                module_groups['other']['modules'].append(module_name)
                continue

            # The type is sticky: once a release switches it, later
            # releases keep it until another tag matches.
            project_type = 'official-other'
            for release_name in [r.lower() for r in release_names]:
                for tag, type_id in since_tag_to_type:
                    if release_name == module.get(tag):
                        project_type = type_id
                        break

                per_release = module_groups[project_type]['releases']
                per_release[release_name].append(module_name)

    # Stamp every group (including lazily created ones) with its own key.
    for key, group in six.iteritems(module_groups):
        group['id'] = key

    return module_groups

Example #2

0

Show file

File: mps.py Project: ZigZg/spectrometer

    def _retrieve_member(self, member_id, html_parser):
        """Read ``self.uri`` and extract the member's profile fields.

        :param member_id: value stored under ``member_id`` in the result
        :param html_parser: object whose ``unescape`` method is applied to
            the matched company text
        :returns: ``{}`` when nothing could be read from ``self.uri``.
            Otherwise a dict that always has ``company_draft`` (defaulting
            to ``'*independent'``) and, when ``NAME_AND_DATE_PATTERN``
            matches, also ``member_id``, ``member_name``, ``date_joined``
            and ``member_uri``.
        """
        page = utils.read_uri(self.uri)
        if not page:
            return {}

        profile = {}

        # Only the first name/date occurrence on the page is used.
        first_hit = re.search(NAME_AND_DATE_PATTERN, page)
        if first_hit is not None:
            fields = first_hit.groupdict()
            profile['member_id'] = member_id
            profile['member_name'] = fields['member_name']
            profile['date_joined'] = fields['date_joined']
            profile['member_uri'] = self.uri

        # Every company match overwrites the previous one, so the last
        # occurrence on the page wins.
        profile['company_draft'] = '*independent'
        for hit in re.finditer(COMPANY_PATTERN, page):
            raw_company = hit.groupdict()['company_draft']
            profile['company_draft'] = html_parser.unescape(raw_company)

        return profile

Example #3

0

Show file

File: mps.py Project: tnadeau/spectrometer

    def _retrieve_member(self, member_id, html_parser):
        """Read ``self.uri`` and extract the member's profile fields.

        :param member_id: value stored under ``member_id`` in the result
        :param html_parser: object whose ``unescape`` method is applied to
            the matched company text
        :returns: ``{}`` when nothing could be read from ``self.uri``.
            Otherwise a dict that always has ``company_draft`` (defaulting
            to ``'*independent'``) and, when ``NAME_AND_DATE_PATTERN``
            matches, also ``member_id``, ``member_name``, ``date_joined``
            and ``member_uri``.
        """
        page = utils.read_uri(self.uri)
        if not page:
            return {}

        profile = {}

        # Only the first name/date occurrence on the page is used.
        first_hit = re.search(NAME_AND_DATE_PATTERN, page)
        if first_hit is not None:
            fields = first_hit.groupdict()
            profile['member_id'] = member_id
            profile['member_name'] = fields['member_name']
            profile['date_joined'] = fields['date_joined']
            profile['member_uri'] = self.uri

        # Every company match overwrites the previous one, so the last
        # occurrence on the page wins.
        profile['company_draft'] = '*independent'
        for hit in re.finditer(COMPANY_PATTERN, page):
            raw_company = hit.groupdict()['company_draft']
            profile['company_draft'] = html_parser.unescape(raw_company)

        return profile

Example #4

0

Show file

File: mls.py Project: tnadeau/spectrometer

def _get_mail_archive_links(uri):
    """Return absolute URLs of the mail archive files linked from ``uri``.

    The page read from ``uri`` is scanned (case-insensitively) for ``href``
    attribute values that run up to a ``.txt`` or ``.txt.gz`` extension.
    Duplicates are dropped and every link is resolved against ``uri``.

    :param uri: location of the archive index page
    :returns: list of absolute links in no particular order (they pass
        through a ``set``); an empty list when nothing could be read
    """
    content = utils.read_uri(uri)
    if not content:
        # Nothing was read; re.findall would raise TypeError on None, so
        # report "no links" instead of crashing the caller.
        return []

    links = set(
        re.findall(r'\shref\s*=\s*[\'"]([^\'"]*\.txt(\.gz)?)',
                   content,
                   flags=re.IGNORECASE))
    # each link is a tuple due to having multiple groups in the re
    # we are only interested in the first one
    return [parse.urljoin(uri, link[0]) for link in links]

Example #5

0

Show file

File: mls.py Project: tnadeau/spectrometer

def _retrieve_mails(uri):
    """Yield the e-mail records found in the mail archive at ``uri``.

    The archive is read with ``utils.read_uri`` and gunzipped when the uri
    contains ``.txt.gz``. ``TRAILING_RECORD`` is appended and the text is
    split into records with ``MAIL_BOX_PATTERN``.

    Each yielded dict holds the named groups of ``MAIL_BOX_PATTERN`` with:

    * ``author_email`` de-obfuscated (first ``' at '`` becomes ``'@'``)
    * ``date`` converted to an integer timestamp
    * one list of distinct ids per entry of ``MESSAGE_PATTERNS``
    * ``module`` set when a message pattern defines a ``module`` group

    Records are skipped when the author address fails
    ``utils.check_email_validity`` or when the date cannot be parsed.
    Nothing is yielded when the archive cannot be read.

    :param uri: location of the mail archive
    """
    LOG.debug('Retrieving mail archive from uri: %s', uri)
    content = utils.read_uri(uri)
    if not content:
        LOG.error('Error reading mail archive from uri: %s', uri)
        return

    # only gunzip if the uri names a .txt.gz file
    # (raw string: '\.' is an invalid escape in a normal literal)
    if re.search(r'\.txt\.gz', uri):
        LOG.debug('%s is a gzipped file', uri)
        gzip_fd = gzip.GzipFile(fileobj=StringIO.StringIO(content))
        try:
            content = gzip_fd.read()
        finally:
            gzip_fd.close()
    else:
        LOG.debug('%s is not a gzipped file', uri)

    LOG.debug('Mail archive is loaded, start processing')

    content += TRAILING_RECORD

    for rec in re.finditer(MAIL_BOX_PATTERN, content):
        email = rec.groupdict()
        email['author_email'] = email['author_email'].replace(' at ', '@', 1)
        if not utils.check_email_validity(email['author_email']):
            continue

        # parsedate_tz yields None for a date it cannot parse (assuming
        # email_utils is the stdlib email.utils); mktime_tz(None) would
        # raise TypeError and abort the whole archive, so skip the record.
        parsed_date = email_utils.parsedate_tz(email['date'])
        if parsed_date is None:
            LOG.warning('Skipping mail with unparseable date: %r',
                        email['date'])
            continue
        email['date'] = int(email_utils.mktime_tz(parsed_date))

        for pattern_name, pattern in six.iteritems(MESSAGE_PATTERNS):
            collection = set()
            for item in re.finditer(pattern, email['body']):
                groups = item.groupdict()
                item_id = groups['id']
                if 'module' in groups:
                    # qualify the id with its module and remember the module
                    item_id = groups['module'] + ':' + item_id
                    email['module'] = groups['module']
                collection.add(item_id)
            email[pattern_name] = list(collection)

        yield email

Example #6

0

Show file

File: mls.py Project: nilok/spectrometer

def _retrieve_mails(uri):
    """Yield the e-mail records found in the mail archive at ``uri``.

    The archive is read with ``utils.read_uri`` and gunzipped when the uri
    contains ``.txt.gz``. ``TRAILING_RECORD`` is appended and the text is
    split into records with ``MAIL_BOX_PATTERN``.

    Each yielded dict holds the named groups of ``MAIL_BOX_PATTERN`` with:

    * ``author_email`` de-obfuscated (first ``" at "`` becomes ``"@"``)
    * ``date`` converted to an integer timestamp
    * one list of distinct ids per entry of ``MESSAGE_PATTERNS``
    * ``module`` set when a message pattern defines a ``module`` group

    Records are skipped when the author address fails
    ``utils.check_email_validity`` or when the date cannot be parsed.
    Nothing is yielded when the archive cannot be read.

    :param uri: location of the mail archive
    """
    LOG.debug("Retrieving mail archive from uri: %s", uri)
    content = utils.read_uri(uri)
    if not content:
        LOG.error("Error reading mail archive from uri: %s", uri)
        return

    # only gunzip if the uri names a .txt.gz file
    # (raw string: "\." is an invalid escape in a normal literal)
    if re.search(r"\.txt\.gz", uri):
        LOG.debug("%s is a gzipped file", uri)
        gzip_fd = gzip.GzipFile(fileobj=StringIO.StringIO(content))
        try:
            content = gzip_fd.read()
        finally:
            gzip_fd.close()
    else:
        LOG.debug("%s is not a gzipped file", uri)

    LOG.debug("Mail archive is loaded, start processing")

    content += TRAILING_RECORD

    for rec in re.finditer(MAIL_BOX_PATTERN, content):
        email = rec.groupdict()
        email["author_email"] = email["author_email"].replace(" at ", "@", 1)
        if not utils.check_email_validity(email["author_email"]):
            continue

        # parsedate_tz yields None for a date it cannot parse (assuming
        # email_utils is the stdlib email.utils); mktime_tz(None) would
        # raise TypeError and abort the whole archive, so skip the record.
        parsed_date = email_utils.parsedate_tz(email["date"])
        if parsed_date is None:
            LOG.warning("Skipping mail with unparseable date: %r", email["date"])
            continue
        email["date"] = int(email_utils.mktime_tz(parsed_date))

        for pattern_name, pattern in six.iteritems(MESSAGE_PATTERNS):
            collection = set()
            for item in re.finditer(pattern, email["body"]):
                groups = item.groupdict()
                item_id = groups["id"]
                if "module" in groups:
                    # qualify the id with its module and remember the module
                    item_id = groups["module"] + ":" + item_id
                    email["module"] = groups["module"]
                collection.add(item_id)
            email[pattern_name] = list(collection)

        yield email

Example #7

0

Show file

File: mls.py Project: nilok/spectrometer

def _get_mail_archive_links(uri):
    """Return absolute URLs of the mail archive files linked from ``uri``.

    The page read from ``uri`` is scanned (case-insensitively) for ``href``
    attribute values that run up to a ``.txt`` or ``.txt.gz`` extension.
    Duplicates are dropped and every link is resolved against ``uri``.

    :param uri: location of the archive index page
    :returns: list of absolute links in no particular order (they pass
        through a ``set``); an empty list when nothing could be read
    """
    content = utils.read_uri(uri)
    if not content:
        # Nothing was read; re.findall would raise TypeError on None, so
        # report "no links" instead of crashing the caller.
        return []

    links = set(re.findall(r'\shref\s*=\s*[\'"]([^\'"]*\.txt(\.gz)?)', content, flags=re.IGNORECASE))
    # each link is a tuple due to having multiple groups in the re
    # we are only interested in the first one
    return [parse.urljoin(uri, link[0]) for link in links]

Example #8

0

Show file

File: main.py Project: tnadeau/spectrometer

def _read_official_programs_yaml(program_list_uri, release_names):
    """Build module groups from the programs YAML found at a URI.

    The document is fetched with ``utils.read_uri`` and parsed with
    ``yaml.safe_load``. Each top-level entry is treated as a program whose
    ``projects`` list holds dicts with a ``repo`` key of the form
    ``<owner>/<module>``.

    :param program_list_uri: location of the YAML document
    :param release_names: iterable of release names; they are lower-cased
        before being compared with the ``*-since`` values of a project
    :returns: a ``defaultdict`` keyed by group id. Every group dict has
        ``modules``, ``releases`` and ``id``; program groups and the four
        pre-registered project-type groups also carry ``tag`` and
        ``module_group_name``.
    """
    LOG.debug('Process list of programs from uri: %s', program_list_uri)
    programs = yaml.safe_load(utils.read_uri(program_list_uri))

    def _empty_group():
        return {
            'modules': [],
            'releases': collections.defaultdict(list)
        }

    module_groups = collections.defaultdict(_empty_group)

    # Pre-register the project-type groups so they exist even when no
    # project ends up in them.
    for type_id in ('official-bootstrap', 'official-incubation',
                    'official-mature', 'official-core'):
        type_group = module_groups[type_id]
        type_group['tag'] = 'project_type'
        type_group['module_group_name'] = type_id

    # Ordered: when several tags name the same release, the first wins.
    since_tag_to_type = (
        ('bootstrapped-since', 'official-bootstrap'),
        ('incubated-since', 'official-incubation'),
        ('mature-since', 'official-mature'),
        ('core-since', 'official-core'),
    )

    for program_name, info in six.iteritems(programs):
        group_id = program_name.lower()
        display_name = program_name
        if 'codename' in info:
            display_name = '%s (%s)' % (info['codename'], program_name)
            group_id = '%s-group' % info['codename'].lower()

        program_group = module_groups[group_id]
        program_group['module_group_name'] = display_name
        program_group['tag'] = 'program'

        for module in info['projects']:
            module_name = module['repo'].split('/')[1]
            module_groups[group_id]['modules'].append(module_name)

            if not any(tag in module for tag, _ in since_tag_to_type):
                # No lifecycle information at all for this project.
                module_groups['other']['modules'].append(module_name)
                continue

            # The type is sticky: once a release switches it, later
            # releases keep it until another tag matches.
            project_type = 'official-other'
            for release_name in [r.lower() for r in release_names]:
                for tag, type_id in since_tag_to_type:
                    if release_name == module.get(tag):
                        project_type = type_id
                        break

                per_release = module_groups[project_type]['releases']
                per_release[release_name].append(module_name)

    # Stamp every group (including lazily created ones) with its own key.
    for key, group in six.iteritems(module_groups):
        group['id'] = key

    return module_groups