Ejemplo n.º 1
0
def est_smooth_kernel(txt):
    """Extract candidate spatial smoothing kernel sizes from text.

    Each pattern in ``smooth_spatial_ptn`` is searched in ``txt`` after the
    characters ``( ) : =`` have been removed.  A match is kept when its
    ``kernel`` group parses as a float no greater than ``MAX_FWHM`` and the
    match is not immediately preceded by a digit or period followed by
    whitespace.

    :param txt: Text to search
    :return: List of dicts with keys ``value`` (float), ``context``,
        ``group`` (full matched text) and ``span``
    """
    kernels = []

    ctxt = re.sub(r'[():=]', '', txt)

    for ptn in smooth_spatial_ptn:

        for match in rex_flex(ptn, ctxt, re.finditer):

            kernel_str = match.groupdict()['kernel']

            # A group that did not participate in the match is None; skip it
            # here instead of letting re.search() below raise TypeError
            if kernel_str is None:
                continue

            # Collapse strings of the form "N6N6N" (the same number three
            # times, joined by the digit 6) to a single "N".
            # NOTE(review): presumably a garbled "N x N x N" from text
            # extraction -- confirm against real inputs.
            six_match = re.search(r'(\d+?)6(\d+?)6(\d+)', kernel_str)
            if six_match and len(set(six_match.groups())) == 1:
                kernel_str = six_match.groups()[-1]

            # Skip if match preceded by number; prevents matches on input like
            # "6 .5 mm fwhm"
            context_back, _ = rex_ctx(match, ctxt, nchar_pre=10, nchar_post=0)
            before_group = context_back.replace(match.group(), '')
            if re.search(r'[.\d]\s+$', before_group):
                continue

            # Skip if not float
            try:
                kernel = float(kernel_str)
            except (ValueError, TypeError):
                continue

            # Skip if implausible
            if kernel > MAX_FWHM:
                continue

            context, span = rex_ctx(match, ctxt)
            kernels.append({
                'value': kernel,
                'context': context,
                'group': match.group(),
                'span': span,
            })

    return kernels
Ejemplo n.º 2
0
def est_smooth_kernel(txt):
    """Extract candidate spatial smoothing kernel sizes from text.

    Each pattern in ``smooth_spatial_ptn`` is searched in ``txt`` after the
    characters ``( ) : =`` have been removed.  A match is kept when its
    ``kernel`` group parses as a float no greater than ``MAX_FWHM`` and the
    match is not immediately preceded by a digit or period followed by
    whitespace.

    :param txt: Text to search
    :return: List of dicts with keys ``value`` (float), ``context``,
        ``group`` (full matched text) and ``span``
    """
    kernels = []

    ctxt = re.sub(r'[():=]', '', txt)

    for ptn in smooth_spatial_ptn:

        for match in rex_flex(ptn, ctxt, re.finditer):

            kernel_str = match.groupdict()['kernel']

            # A group that did not participate in the match is None; skip it
            # here instead of letting re.search() below raise TypeError
            if kernel_str is None:
                continue

            # Collapse strings of the form "N6N6N" (the same number three
            # times, joined by the digit 6) to a single "N".
            # NOTE(review): presumably a garbled "N x N x N" from text
            # extraction -- confirm against real inputs.
            six_match = re.search(r'(\d+?)6(\d+?)6(\d+)', kernel_str)
            if six_match and len(set(six_match.groups())) == 1:
                kernel_str = six_match.groups()[-1]

            # Skip if match preceded by number; prevents matches on input like
            # "6 .5 mm fwhm"
            context_back, _ = rex_ctx(match, ctxt, nchar_pre=10, nchar_post=0)
            before_group = context_back.replace(match.group(), '')
            if re.search(r'[.\d]\s+$', before_group):
                continue

            # Skip if not float
            try:
                kernel = float(kernel_str)
            except (ValueError, TypeError):
                continue

            # Skip if implausible
            if kernel > MAX_FWHM:
                continue

            context, span = rex_ctx(match, ctxt)
            kernels.append({
                'value': kernel,
                'context': context,
                'group': match.group(),
                'span': span,
            })

    return kernels
Ejemplo n.º 3
0
def parse_version_row(row):
    """Extract the version number and its labels from a version table row.

    :param row: Table row exposing ``find_all``; its ``td`` cells are read
    :return: Tuple of (number, [labels]), or ``None`` when the row has no
        ``td`` cells
    """
    cells = row.find_all('td')
    if not cells:
        return

    raw_number, name = cells[0].text, cells[1].text

    # Labels keep the raw first-cell text; the name is added only if non-empty
    labels = [raw_number, name] if name else [raw_number]

    # The returned number drops any "matlab" (case-insensitive) and is stripped
    number = re.sub(r'matlab', '', raw_number, flags=re.I).strip()

    return number, labels
Ejemplo n.º 4
0
def parse_publication_date(date_text):
    """Parse a publication date string.

    Matches of ``month_range_pattern`` are first replaced by their first
    capture group, then the result is handed to ``dateparser.parse``.

    :param date_text: Date text to parse
    :return: Whatever ``dateparser.parse`` returns, or ``None`` if it raises
        ``TypeError``, ``ValueError`` or ``AttributeError``
    """
    collapsed = re.sub(month_range_pattern, '\\1', date_text)
    try:
        parsed = dateparser.parse(collapsed)
    except (TypeError, ValueError, AttributeError):
        parsed = None
    return parsed
Ejemplo n.º 5
0
def est_highpass_cutoff(txt):
    """Estimate high-pass filter cutoffs mentioned in text.

    Each pattern in ``highpass_filter_bool_ptn`` is searched in ``txt`` with
    the characters ``( ) : =`` removed.  A hit is dropped when any pattern in
    ``highpass_filter_neg_ptn`` matches its context, or any pattern in
    ``highpass_filter_wide_neg_ptn`` matches the context requested with
    ``nchar=500``.  The context requested with ``nchar_pre=0`` is then
    scanned for a fraction (e.g. 1 / 128 Hz); only if that does not yield
    exactly one candidate is it scanned for a single value (e.g. 0.05 Hz).
    A value is recorded only when exactly one candidate is found, and it is
    inverted when its units are ``hz``.

    :param txt: Text to search
    :return: List of dicts with keys ``value``, ``context``, ``group`` and
        ``span``
    """
    def has_stray_leading_zero(number):
        # "0.5" passes; "05" or a bare "0" does not
        return number.startswith('0') and not number.startswith('0.')

    results = []

    cleaned = re.sub('[():=]', '', txt)

    for bool_ptn in highpass_filter_bool_ptn:

        for hit in rex_flex(bool_ptn, cleaned, re.finditer):

            context, span = rex_ctx(hit, cleaned)

            # Drop the hit if a negative pattern matches its context
            if any(rex_flex(neg, context)
                   for neg in highpass_filter_neg_ptn):
                continue

            # Drop the hit if a wide-range negative pattern matches
            wide_context, _ = rex_ctx(hit, cleaned, nchar=500)
            if any(rex_flex(neg, wide_context)
                   for neg in highpass_filter_wide_neg_ptn):
                continue

            search_text, _ = rex_ctx(hit, cleaned, nchar_pre=0)

            # Fraction form first
            fractions = list(rex_flex(
                highpass_filter_values_fraction, search_text,
                fun=re.finditer
            ))

            if len(fractions) == 1:

                fields = fractions[0].groupdict()
                num_text, dnm_text = fields['num'], fields['dnm']

                if has_stray_leading_zero(num_text) or \
                        has_stray_leading_zero(dnm_text):
                    continue

                try:
                    num, dnm = float(num_text), float(dnm_text)
                except (ValueError, TypeError):
                    continue

                # A zero on either side would divide by zero below
                if num == 0 or dnm == 0:
                    continue

                value = num / dnm
                if fields['units'].lower() == 'hz':
                    value = 1 / value

                results.append({
                    'value': value,
                    'context': context,
                    'group': fractions[0].group(),
                    'span': span,
                })

                # A lone fraction candidate ends processing of this hit
                continue

            # Single-value form
            singles = list(rex_flex(
                highpass_filter_values, search_text, fun=re.finditer
            ))

            if len(singles) != 1:
                continue

            fields = singles[0].groupdict()
            cutoff_text = fields['cutoff']

            if has_stray_leading_zero(cutoff_text):
                continue

            try:
                value = float(cutoff_text)
            except (ValueError, TypeError):
                continue

            if value == 0:
                continue

            if fields['units'].lower() == 'hz':
                value = 1 / value

            results.append({
                'value': value,
                'context': context,
                'group': singles[0].group(),
                'span': span,
            })

    return results
Ejemplo n.º 6
0
def parse_publication_date(date_text):
    """Parse a publication date string.

    Matches of ``month_range_pattern`` are first replaced by their first
    capture group, then the result is handed to ``dateparser.parse``.

    :param date_text: Date text to parse
    :return: Whatever ``dateparser.parse`` returns, or ``None`` if it raises
        ``TypeError``, ``ValueError`` or ``AttributeError``
    """
    collapsed = re.sub(month_range_pattern, '\\1', date_text)
    try:
        parsed = dateparser.parse(collapsed)
    except (TypeError, ValueError, AttributeError):
        parsed = None
    return parsed
Ejemplo n.º 7
0
def get_version_regex(value, flags=re.I):
    """Apply every substitution in ``version_subs`` to ``value``, in order.

    :param value: String to transform
    :param flags: ``re`` flags passed to each substitution
    :return: The transformed string
    """
    result = value
    for rule in version_subs:
        # Each rule is indexed as (pattern, replacement)
        pattern, replacement = rule[0], rule[1]
        result = re.sub(pattern, replacement, result, flags=flags)
    return result
Ejemplo n.º 8
0
def get_matlab_versions(overwrite=False):
    """Get MATLAB versions from Wikipedia.

    Results are cached in a shelf file under ``trendpath.data_dir``; the
    cached value is returned when that file exists and ``overwrite`` is
    false.  Otherwise the release history table is scraped and the cache is
    rewritten.

    :param overwrite: Overwrite existing data?
    :return: Dict mapping each version number to a list holding the version
        number and, when non-empty, the processed version name

    """
    # Get version file
    version_file = os.path.join(trendpath.data_dir, 'matlab-versions.shelf')

    # Use saved versions if version file exists and not overwrite
    if os.path.exists(version_file) and not overwrite:
        shelf = shelve.open(version_file)
        # Close the shelf even if the key is missing or unreadable
        try:
            return shelf['versions']
        finally:
            shelf.close()

    # Open Wikipedia page
    response = requests.get('http://en.wikipedia.org/wiki/MATLAB')
    soup = BeautifulSoup(response.content)

    # Find "Release History" table
    history_headline = soup.find(id='Release_history')
    history_table = history_headline.find_next(
        'table',
        class_=re.compile(r'wikitable'),
    )
    history_row = history_table.find_all('tr')

    # Initialize Matlab versions
    versions = {}

    # The first row is skipped (presumably the table header)
    for row in history_row[1:]:

        # Get <td> elements
        tds = row.findAll('td')

        # Skip rows lacking both a version-number and a version-name cell
        # rather than failing with IndexError
        if len(tds) < 2:
            continue

        # Get version number
        version_number = re.sub(r'matlab\s+', '', tds[0].text, flags=re.I)

        # Get version name
        version_name = tds[1].text

        # "Service Pack" -> "sp".  This must run before "r" is made optional:
        # otherwise "Service" has already become "Ser?vice" and the pattern
        # below can no longer match it.
        version_name = re.sub(
            r'{dlm}(sp|service pack){dlm}'.format(dlm=delimiter),
            'sp',
            version_name,
            flags=re.I
        )

        # Make "r" in e.g. "r2007a" optional
        version_name = re.sub(r'r', 'r?', version_name, flags=re.I)

        # Add to versions
        versions[version_number] = [version_number]
        if version_name:
            versions[version_number].append(version_name)

    # Save results to version file, closing the shelf even if the write fails
    shelf = shelve.open(version_file)
    try:
        shelf['versions'] = versions
    finally:
        shelf.close()

    # Return versions
    return versions
Ejemplo n.º 9
0
# NOTE(review): presumably read by the tagging framework to group the taggers
# defined below -- confirm against the loader.
category = 'tool'

from neurotrends.config import re
from neurotrends.tagger import RexComboVersionTagger
from misc import version_separator

# SPSS tagger.  Name patterns: the whole words "SPSS" or "PASW".  The version
# pattern captures digits followed by up to two further ".digits" components
# (e.g. "19", "19.0", "19.0.1").  post_proc removes a trailing "." or ".0".
# NOTE(review): how RexComboVersionTagger combines the name patterns,
# version_separator and arbitrary_rex is not visible here -- see its definition.
spss = RexComboVersionTagger(
    'spss',
    [
        r'\bSPSS\b',
        r'\bPASW\b',
    ],
    version_separator,
    flags=re.VERBOSE,
    arbitrary_rex=r'(?P<version>\d+(\.\d+){0,2})',
    post_proc=lambda v: re.sub(r'\.0?$', '', v),
)

# Statistica tagger.  The name pattern matches "statistica" ending at a word
# boundary, unless it is followed by whitespace and an "l", or by "sinica"
# within the next 50 characters.  The version pattern captures digits with at
# most one ".digits" component; post_proc removes a trailing "." or ".0".
statistica = RexComboVersionTagger(
    'statistica',
    [
        r'statistica\b(?!\s+l)(?!.{,50}?sinica)',
    ],
    version_separator,
    flags=re.VERBOSE,
    arbitrary_rex=r'(?P<version>\d+(\.\d+){,1})',
    post_proc=lambda v: re.sub(r'\.0?$', '', v),
)

sas = RexComboVersionTagger(
    'sas',
Ejemplo n.º 10
0
    def tag(self, tag_groups=None, overwrite=False, save=True):
        """Extract tags from the article's verified documents and store them.

        :param list tag_groups: List of TagGroup objects; defaults to the
            values of ``pattern.tag_groups``
        :param bool overwrite: Discard existing tags before tagging
        :param bool save: Save record after update
        :return list: Tags created by this call, plus existing tags that
            gained context for a new document type

        """
        groups = tag_groups or pattern.tag_groups.values()

        if not overwrite:
            known_tags = [tagger.Tag(stored) for stored in self.tags]
        else:
            self.tags = []
            known_tags = []

        changed = []

        self.verify(save=False)

        for doc_type in self.verified:

            source = getattr(self, DOCUMENT_TYPES_TO_FIELDS[doc_type])

            # No document stored for this type
            if source is None:
                continue

            text = source.read()

            # Nothing to tag in an empty document
            if not text:
                continue

            # Normalize U+2044 to "/" and U+2212 to "-", then collapse runs
            # of whitespace, hyphens and commas to a single space
            # TODO: Refactor as helper function
            text = text.replace(u'\u2044', '/').replace(u'\u2212', '-')
            text = re.sub(r'[\s\-,]+', ' ', text)

            for group in groups:

                for found in tagger.tag(group, text):

                    # Per-document-type wrappers for the extracted details
                    context_entry = {doc_type: found['context']}
                    group_entry = {doc_type: found['group']}
                    span_entry = {doc_type: found['span']}

                    # Unseen tag: record it with its details keyed by
                    # document type
                    if found not in known_tags:
                        found['context'] = context_entry
                        found['group'] = group_entry
                        found['span'] = span_entry
                        known_tags.append(found)
                        changed.append(found)
                        continue

                    # Known tag: add details only for a new document type
                    current = known_tags[known_tags.index(found)]
                    if doc_type not in current['context']:
                        current['context'].update(context_entry)
                        current['group'].update(group_entry)
                        current['span'].update(span_entry)
                        changed.append(current)

        # Cast tags to dictionaries for ODM compatibility
        self.tags = [dict(item) for item in known_tags]

        # Update tagged date
        self.date_last_tagged = datetime.datetime.utcnow()

        if save:
            self.save()

        return changed
Ejemplo n.º 11
0
def post_proc(value):
    """Remove a trailing "." followed by one or more "." or "0" characters.

    For example ``"5.0"`` and ``"5.0.0"`` both become ``"5"``, while
    ``"5.1"`` is returned unchanged.
    """
    trailing_zero_run = r'\.[\.0]+$'
    trimmed = re.sub(trailing_zero_run, '', value)
    return trimmed
Ejemplo n.º 12
0
def clean_delimiters(text):
    """Collapse each run of whitespace, hyphens and commas to one space."""
    delimiter_run = r'[\s\-,]+'
    collapsed = re.sub(delimiter_run, ' ', text)
    return collapsed
Ejemplo n.º 13
0
def post_proc(value):
    """Remove a trailing "." followed by one or more "." or "0" characters.

    For example ``"5.0"`` and ``"5.0.0"`` both become ``"5"``, while
    ``"5.1"`` is returned unchanged.
    """
    trailing_zero_run = r'\.[\.0]+$'
    trimmed = re.sub(trailing_zero_run, '', value)
    return trimmed
Ejemplo n.º 14
0
def est_highpass_cutoff(txt):
    """Estimate high-pass filter cutoffs mentioned in text.

    Each pattern in ``highpass_filter_bool_ptn`` is searched in ``txt`` with
    the characters ``( ) : =`` removed.  A hit is dropped when any pattern in
    ``highpass_filter_neg_ptn`` matches its context, or any pattern in
    ``highpass_filter_wide_neg_ptn`` matches the context requested with
    ``nchar=500``.  The context requested with ``nchar_pre=0`` is then
    scanned for a fraction (e.g. 1 / 128 Hz); only if that does not yield
    exactly one candidate is it scanned for a single value (e.g. 0.05 Hz).
    A value is recorded only when exactly one candidate is found, and it is
    inverted when its units are ``hz``.

    :param txt: Text to search
    :return: List of dicts with keys ``value``, ``context``, ``group`` and
        ``span``
    """
    def has_stray_leading_zero(number):
        # "0.5" passes; "05" or a bare "0" does not
        return number.startswith('0') and not number.startswith('0.')

    results = []

    cleaned = re.sub(r'[():=]', '', txt)

    for bool_ptn in highpass_filter_bool_ptn:

        for hit in rex_flex(bool_ptn, cleaned, re.finditer):

            context, span = rex_ctx(hit, cleaned)

            # Drop the hit if a negative pattern matches its context
            if any(rex_flex(neg, context)
                   for neg in highpass_filter_neg_ptn):
                continue

            # Drop the hit if a wide-range negative pattern matches
            wide_context, _ = rex_ctx(hit, cleaned, nchar=500)
            if any(rex_flex(neg, wide_context)
                   for neg in highpass_filter_wide_neg_ptn):
                continue

            search_text, _ = rex_ctx(hit, cleaned, nchar_pre=0)

            # Fraction form first
            fractions = list(rex_flex(highpass_filter_values_fraction,
                                      search_text,
                                      fun=re.finditer))

            if len(fractions) == 1:

                fields = fractions[0].groupdict()
                num_text, dnm_text = fields['num'], fields['dnm']

                if has_stray_leading_zero(num_text) or \
                        has_stray_leading_zero(dnm_text):
                    continue

                try:
                    num, dnm = float(num_text), float(dnm_text)
                except (ValueError, TypeError):
                    continue

                # A zero on either side would divide by zero below
                if num == 0 or dnm == 0:
                    continue

                value = num / dnm
                if fields['units'].lower() == 'hz':
                    value = 1 / value

                results.append({
                    'value': value,
                    'context': context,
                    'group': fractions[0].group(),
                    'span': span,
                })

                # A lone fraction candidate ends processing of this hit
                continue

            # Single-value form
            singles = list(rex_flex(highpass_filter_values,
                                    search_text,
                                    fun=re.finditer))

            if len(singles) != 1:
                continue

            fields = singles[0].groupdict()
            cutoff_text = fields['cutoff']

            if has_stray_leading_zero(cutoff_text):
                continue

            try:
                value = float(cutoff_text)
            except (ValueError, TypeError):
                continue

            if value == 0:
                continue

            if fields['units'].lower() == 'hz':
                value = 1 / value

            results.append({
                'value': value,
                'context': context,
                'group': singles[0].group(),
                'span': span,
            })

    return results