def est_smooth_kernel(txt):
    """Extract candidate spatial smoothing kernel sizes from text.

    Every pattern in ``smooth_spatial_ptn`` is searched against a copy of
    ``txt`` with the characters ``( ) : =`` removed. A match is dropped when
    it is directly preceded by a digit or period plus whitespace, when its
    ``kernel`` group cannot be converted to float, or when the value
    exceeds ``MAX_FWHM``.

    :param txt: Text to search
    :return: List of dicts with keys ``value`` (float kernel size),
        ``context``, ``group`` (matched text) and ``span``
    """
    kernels = []
    ctxt = re.sub(r'[():=]', '', txt)

    for ptn in smooth_spatial_ptn:
        for match in rex_flex(ptn, ctxt, re.finditer):

            kernel_str = match.groupdict()['kernel']

            # Collapse strings of the form "<a>6<b>6<c>" whose three numeric
            # parts are identical down to that single number.
            # NOTE(review): presumably a "8x8x8"-style kernel whose separator
            # was extracted as "6" -- confirm against the corpus.
            six_match = re.search(r'(\d+?)6(\d+?)6(\d+)', kernel_str)
            if six_match and len(set(six_match.groups())) == 1:
                kernel_str = six_match.groups()[-1]

            # Skip if match preceded by number; prevents matches on input
            # like "6 .5 mm fwhm"
            context_back, _ = rex_ctx(match, ctxt, nchar_pre=10, nchar_post=0)
            before_group = context_back.replace(match.group(), '')
            if re.search(r'[.\d]\s+$', before_group):
                continue

            # Skip if not float
            try:
                kernel = float(kernel_str)
            except (ValueError, TypeError):
                continue

            # Skip if implausible
            if kernel > MAX_FWHM:
                continue

            context, span = rex_ctx(match, ctxt)
            kernels.append({
                'value': kernel,
                'context': context,
                'group': match.group(),
                'span': span,
            })

    return kernels
def parse_version_row(row):
    """Get version number and labels from version table row.

    The first ``<td>`` holds the version number and the second, when
    present, the version name. The labels list keeps the first cell's text
    unchanged; the returned number has any case-insensitive "matlab"
    removed and surrounding whitespace stripped.

    :param row: Table row object exposing ``find_all('td')``, whose cells
        expose a ``text`` attribute
    :return: Tuple of (number, [labels]), or None if the row has no
        ``<td>`` cells
    """
    cols = row.find_all('td')
    if not cols:
        return

    raw_number = cols[0].text
    # A row with a single cell has no name column; treat the name as empty
    # instead of failing with IndexError.
    version_name = cols[1].text if len(cols) > 1 else ''

    version_values = [raw_number]
    if version_name:
        version_values.append(version_name)

    version_number = re.sub(r'matlab', '', raw_number, flags=re.I).strip()
    return version_number, version_values
def parse_publication_date(date_text):
    """Parse a publication date string.

    Each match of ``month_range_pattern`` is first replaced by its first
    captured group, then the result is passed to ``dateparser.parse``.

    :param date_text: Raw date text
    :return: Whatever ``dateparser.parse`` returns, or None if it raises
        TypeError, ValueError or AttributeError
    """
    simplified = re.sub(month_range_pattern, '\\1', date_text)
    try:
        parsed = dateparser.parse(simplified)
    except (TypeError, ValueError, AttributeError):
        return None
    return parsed
def est_highpass_cutoff(txt):
    """Extract candidate high-pass filter cutoffs from text.

    Each match of a pattern in ``highpass_filter_bool_ptn`` is rejected if
    its context matches any ``highpass_filter_neg_ptn`` pattern, or if its
    wider context (``nchar=500``) matches any
    ``highpass_filter_wide_neg_ptn`` pattern. Otherwise the text from the
    match onward is searched for exactly one fraction (e.g. 1 / 128 Hz);
    only when the fraction pattern does not match exactly once is a single
    value (e.g. 0.05 Hz) looked for. Values given in "hz" are inverted.

    :param txt: Text to search
    :return: List of dicts with keys ``value``, ``context``, ``group``
        and ``span``
    """
    def zero_padded(number):
        # True for strings such as "05": leading zero that is not "0."
        return number.startswith('0') and not number.startswith('0.')

    found = []
    stripped = re.sub('[():=]', '', txt)

    for bool_ptn in highpass_filter_bool_ptn:
        for hit in rex_flex(bool_ptn, stripped, re.finditer):

            context, span = rex_ctx(hit, stripped)

            # Reject hits whose context matches a negative pattern
            if any(rex_flex(neg, context) for neg in highpass_filter_neg_ptn):
                continue

            # Reject hits whose wide context matches a wide negative pattern
            wide, _ = rex_ctx(hit, stripped, nchar=500)
            if any(
                rex_flex(neg, wide) for neg in highpass_filter_wide_neg_ptn
            ):
                continue

            following, _ = rex_ctx(hit, stripped, nchar_pre=0)

            # Fraction form (e.g. 1 / 128 Hz)
            fractions = list(rex_flex(
                highpass_filter_values_fraction, following, fun=re.finditer
            ))
            if len(fractions) == 1:
                fraction = fractions[0]
                matched_text = fraction.group()
                parts = fraction.groupdict()
                top, bottom = parts['num'], parts['dnm']
                if zero_padded(top) or zero_padded(bottom):
                    continue
                try:
                    top = float(top)
                    bottom = float(bottom)
                except (ValueError, TypeError):
                    continue
                # Zero in either position would lead to a division by zero
                if top == 0 or bottom == 0:
                    continue
                value = top / bottom
                if parts['units'].lower() == 'hz':
                    value = 1 / value
                found.append({
                    'value': value,
                    'context': context,
                    'group': matched_text,
                    'span': span,
                })
                # A unique fraction ends processing of this hit
                continue

            # Single-value form (e.g. 0.05 Hz)
            singles = list(rex_flex(
                highpass_filter_values, following, fun=re.finditer
            ))
            if len(singles) != 1:
                continue
            single = singles[0]
            matched_text = single.group()
            fields = single.groupdict()
            raw_value = fields['cutoff']
            if zero_padded(raw_value):
                continue
            try:
                value = float(raw_value)
            except (ValueError, TypeError):
                continue
            if value == 0:
                continue
            if fields['units'].lower() == 'hz':
                value = 1 / value
            found.append({
                'value': value,
                'context': context,
                'group': matched_text,
                'span': span,
            })

    return found
def get_version_regex(value, flags=re.I):
    """Apply every substitution in ``version_subs`` to ``value``, in order.

    :param value: String to transform
    :param flags: Regex flags passed to each ``re.sub`` call
    :return: The transformed string
    """
    result = value
    for sub in version_subs:
        # Each entry supplies the pattern at index 0 and its replacement
        # at index 1
        pattern, replacement = sub[0], sub[1]
        result = re.sub(pattern, replacement, result, flags=flags)
    return result
def get_matlab_versions(overwrite=False):
    """Get MATLAB versions from Wikipedia.

    Results are cached in a shelf file under ``trendpath.data_dir``. The
    cache is used when it exists and ``overwrite`` is false; otherwise the
    Wikipedia page is fetched, parsed, and the cache is rewritten.

    :param overwrite: Overwrite existing data?
    :return: MATLAB versions: dict mapping each version number to a list
        holding that number and, when non-empty, a regex-ready version name
    """
    # Get version file
    version_file = os.path.join(trendpath.data_dir, 'matlab-versions.shelf')

    # Use saved versions if version file exists and not overwrite
    if os.path.exists(version_file) and not overwrite:
        shelf = shelve.open(version_file)
        # Close the shelf even if the 'versions' key is missing
        try:
            return shelf['versions']
        finally:
            shelf.close()

    # Open Wikipedia page
    response = requests.get('http://en.wikipedia.org/wiki/MATLAB')
    soup = BeautifulSoup(response.content)

    # Find "Release History" table
    history_headline = soup.find(id='Release_history')
    history_table = history_headline.find_next(
        'table',
        class_=re.compile(r'wikitable'),
    )
    history_row = history_table.find_all('tr')

    # Initialize Matlab versions
    versions = {}

    # The first row is skipped (presumably the table header)
    for row in history_row[1:]:

        # Get <td> elements
        tds = row.findAll('td')

        # Rows without both a number and a name cell cannot be parsed;
        # skip them instead of failing with IndexError
        if len(tds) < 2:
            continue

        # Get version number
        version_number = tds[0].text
        version_number = re.sub(r'matlab\s+', '', version_number, flags=re.I)

        # Get version name
        version_name = tds[1].text

        # Make "r" in e.g. "r2007a" optional
        # NOTE(review): this rewrites every "r"/"R", so "Service Pack"
        # becomes "Ser?vice Pack" before the substitution below runs and its
        # "service pack" alternative likely can no longer match -- confirm
        # whether the two steps should be swapped.
        version_name = re.sub('r', 'r?', version_name, flags=re.I)

        # "Service Pack" -> "sp"
        version_name = re.sub(
            r'{dlm}(sp|service pack){dlm}'.format(dlm=delimiter),
            'sp',
            version_name,
            flags=re.I
        )

        # Add to versions
        versions[version_number] = [version_number]
        if version_name:
            versions[version_number].append(version_name)

    # Save results to version file, closing it even if the write fails
    shelf = shelve.open(version_file)
    try:
        shelf['versions'] = versions
    finally:
        shelf.close()

    # Return versions
    return versions
# Category label for the taggers defined below.
category = 'tool'

from neurotrends.config import re
from neurotrends.tagger import RexComboVersionTagger

from misc import version_separator

# SPSS: matched as the whole word "SPSS" or "PASW".
spss = RexComboVersionTagger(
    'spss',
    [
        r'\bSPSS\b',
        r'\bPASW\b',
    ],
    version_separator,
    flags=re.VERBOSE,
    # Version: digits followed by up to two dotted numeric components
    arbitrary_rex=r'(?P<version>\d+(\.\d+){0,2})',
    # Strip a trailing "." or ".0" from the extracted version
    post_proc=lambda v: re.sub(r'\.0?$', '', v),
)

# Statistica: the lookaheads reject "statistica" followed by whitespace and
# "l", and any occurrence with "sinica" within the next 50 characters
# (presumably the journal "Statistica Sinica" -- confirm).
statistica = RexComboVersionTagger(
    'statistica',
    [
        r'statistica\b(?!\s+l)(?!.{,50}?sinica)',
    ],
    version_separator,
    flags=re.VERBOSE,
    # Version: digits followed by at most one dotted numeric component
    arbitrary_rex=r'(?P<version>\d+(\.\d+){,1})',
    # Strip a trailing "." or ".0" from the extracted version
    post_proc=lambda v: re.sub(r'\.0?$', '', v),
)

sas = RexComboVersionTagger(
    'sas',
def tag(self, tag_groups=None, overwrite=False, save=True):
    """Add tags to article.

    Runs ``self.verify(save=False)``, then for every document type in
    ``self.verified`` reads the corresponding document, normalizes its
    text, and extracts tags for each tag group. A tag equal to an existing
    one has its ``context``, ``group`` and ``span`` dicts extended with the
    entry for the current document type; otherwise the tag is added as new.
    ``self.tags`` and ``self.date_last_tagged`` are updated afterwards.

    :param list tag_groups: List of TagGroup objects; defaults to
        ``pattern.tag_groups.values()`` when falsy
    :param bool overwrite: Overwrite existing tags
    :param bool save: Save record after update
    :return list: New or modified extracted tags
    """
    tag_groups = tag_groups or pattern.tag_groups.values()

    if overwrite:
        # Discard stored tags and start from an empty list
        self.tags = []
        existing_tags = []
    else:
        # Wrap stored tags in tagger.Tag so they can be compared with
        # freshly extracted tags below
        existing_tags = [
            tagger.Tag(tag)
            for tag in self.tags
        ]

    new_tags = []

    self.verify(save=False)

    for document_type in self.verified:

        document_field = DOCUMENT_TYPES_TO_FIELDS[document_type]
        document = getattr(self, document_field)

        # Skip this document type if the document is not set
        if document is None:
            continue

        doc = document.read()

        # Skip this document type if the document text is empty
        if not doc:
            continue

        # Clean document text: map U+2044 (fraction slash) to "/" and
        # U+2212 (minus sign) to "-", then collapse runs of whitespace,
        # hyphens and commas into a single space
        # TODO: Refactor as helper function
        doc = doc.replace(u'\u2044', '/')
        doc = doc.replace(u'\u2212', '-')
        doc = re.sub(r'[\s\-,]+', ' ', doc)

        for tag_group in tag_groups:

            # Extract tags
            tags = tagger.tag(tag_group, doc)

            for tag in tags:

                # Build context documents, keyed by document type
                context_data = {document_type: tag['context']}
                group_data = {document_type: tag['group']}
                span_data = {document_type: tag['span']}

                # Update existing tag with context
                if tag in existing_tags:
                    idx = existing_tags.index(tag)
                    # Only add data for document types not yet recorded
                    if document_type not in existing_tags[idx]['context']:
                        existing_tags[idx]['context'].update(context_data)
                        existing_tags[idx]['group'].update(group_data)
                        existing_tags[idx]['span'].update(span_data)
                        # NOTE(review): nesting reconstructed from flattened
                        # source; confirm this append belongs inside the
                        # "modified" branch
                        new_tags.append(existing_tags[idx])

                # Create new tag in database
                else:
                    tag['context'] = context_data
                    tag['group'] = group_data
                    tag['span'] = span_data
                    existing_tags.append(tag)
                    new_tags.append(tag)

    # Cast tags to dictionaries for ODM compatibility
    self.tags = [
        dict(tag)
        for tag in existing_tags
    ]

    # Update tagged date
    self.date_last_tagged = datetime.datetime.utcnow()

    if save:
        self.save()

    return new_tags
def post_proc(value):
    """Strip a trailing run of periods and zeros that begins with a period.

    For example ``"5.0"`` and ``"5.0.0"`` both become ``"5"``, while
    ``"5.10"`` is returned unchanged.

    :param value: Version string
    :return: Version string without the trailing ``.0`` / ``..`` run
    """
    trailing_zeros = re.compile(r'\.[\.0]+$')
    return trailing_zeros.sub('', value)
def clean_delimiters(text):
    """Collapse each run of whitespace, hyphens and commas into one space.

    :param text: Text to normalize
    :return: Text with every delimiter run replaced by a single space
    """
    delimiter_run = re.compile(r'[\s\-,]+')
    return delimiter_run.sub(' ', text)
def _parse_nonzero(number_str):
    """Return ``number_str`` as a non-zero float, or None if it is rejected.

    Rejected are strings with a leading zero not followed by a period
    (e.g. "05"), strings that ``float`` cannot convert, and zero itself.
    """
    if number_str.startswith('0') and not number_str.startswith('0.'):
        return None
    try:
        value = float(number_str)
    except (ValueError, TypeError):
        return None
    if value == 0:
        return None
    return value


def _apply_units(value, units):
    """Invert ``value`` when ``units`` is "hz" (case-insensitive)."""
    # An optional regex group that did not participate yields None
    if (units or '').lower() == 'hz':
        return 1 / value
    return value


def est_highpass_cutoff(txt):
    """Extract candidate high-pass filter cutoffs from text.

    Each match of a pattern in ``highpass_filter_bool_ptn`` is skipped if
    its context matches any pattern in ``highpass_filter_neg_ptn``, or if
    its wider context (``nchar=500``) matches any pattern in
    ``highpass_filter_wide_neg_ptn``. Otherwise the text from the match
    onward is searched for exactly one fraction (e.g. 1 / 128 Hz); only
    when the fraction pattern does not match exactly once is a single value
    (e.g. 0.05 Hz) looked for. Values given in "hz" are inverted.

    :param txt: Text to search
    :return: List of dicts with keys ``value``, ``context``, ``group``
        (text of the value match) and ``span``; ``context`` and ``span``
        describe the original high-pass match
    """
    cutoffs = []
    ctxt = re.sub(r'[():=]', '', txt)

    for ptn in highpass_filter_bool_ptn:
        for bool_match in rex_flex(ptn, ctxt, re.finditer):

            context, span = rex_ctx(bool_match, ctxt)

            # Skip if negative patterns match
            if any(
                rex_flex(neg_ptn, context)
                for neg_ptn in highpass_filter_neg_ptn
            ):
                continue

            # Skip if wide-range negative patterns match
            context_wide, _ = rex_ctx(bool_match, ctxt, nchar=500)
            if any(
                rex_flex(neg_ptn, context_wide)
                for neg_ptn in highpass_filter_wide_neg_ptn
            ):
                continue

            context_search, _ = rex_ctx(bool_match, ctxt, nchar_pre=0)

            # Match on fraction pattern (e.g. 1 / 128 Hz)
            fraction_matches = list(rex_flex(
                highpass_filter_values_fraction,
                context_search,
                fun=re.finditer,
            ))
            if len(fraction_matches) == 1:
                value_match = fraction_matches[0]
                fields = value_match.groupdict()
                numerator = _parse_nonzero(fields['num'])
                denominator = _parse_nonzero(fields['dnm'])
                # A unique fraction match is final: if it is invalid, do not
                # fall back to the single-value pattern
                if numerator is None or denominator is None:
                    continue
                cutoff = numerator / denominator
            else:
                # Match on single-value pattern (e.g. 0.05 Hz)
                value_matches = list(rex_flex(
                    highpass_filter_values,
                    context_search,
                    fun=re.finditer,
                ))
                if len(value_matches) != 1:
                    continue
                value_match = value_matches[0]
                fields = value_match.groupdict()
                cutoff = _parse_nonzero(fields['cutoff'])
                if cutoff is None:
                    continue

            cutoffs.append({
                'value': _apply_units(cutoff, fields['units']),
                'context': context,
                'group': value_match.group(),
                'span': span,
            })

    return cutoffs