Example 1
def _match_by_edit_distance(full_text, text_to_match):
    text_to_match = text_to_match.replace("-LRB-", "(").replace("-RRB-", ")")
    text_to_match = text_to_match.replace("-LCB-", "{").replace("-RCB-", "}")
    text_to_match = re.sub(r'\[\\\]\\\)\]$', ')', text_to_match)

    try:
        end_point = (text_to_match.index(" ") if " " in text_to_match else len(text_to_match))
        potential_matches = [full_text[m.start():(m.start() + len(text_to_match) + 1)] for m in 
                             re.finditer(re.escape(text_to_match[0:end_point]), full_text, re.U | re.I)]
    except:
        import sys

        print(full_text)
        print()
        print(text_to_match)
        sys.exit(1)
        
    if len(potential_matches) == 0:
        potential_matches = [full_text[m.start():(m.start() + len(text_to_match) + 1)] for m in 
                             re.finditer(re.escape(text_to_match[0]), full_text, re.U)]
    if len(potential_matches) == 0:
        text_to_match = text_to_match.replace("(", "[")
        potential_matches = [full_text[m.start():(m.start() + len(text_to_match) + 1)] for m in 
                             re.finditer(re.escape(text_to_match[0]), full_text, re.U)]

    potential_matches = [(p[0:p.rindex(text_to_match[-1])+1] 
                          if text_to_match[-1] in p and len(p) > len(text_to_match)
                          else p)
                         for p in potential_matches]

    if len(potential_matches) == 0:
        # No idea why this would ever happen, but it does
        return text_to_match

    match_with_lowest_edit_distance = ""
    lowest_edit_distance = -1
    for match in potential_matches:
        e_d = edit_distance(match, text_to_match)
        if lowest_edit_distance == -1 or e_d <= lowest_edit_distance:
            lowest_edit_distance = e_d
            match_with_lowest_edit_distance = match

    result = match_with_lowest_edit_distance.strip()
    if text_to_match[-1] in result:
        while result[-1] != text_to_match[-1]:
            result = result[0:-1]
    elif text_to_match[-1] == '"' and re.search(r'["”\u201d]', result):
        while result[-1] not in ['"', '”', "\u201d"]:
            result = result[0:-1]
    elif text_to_match[-1] not in [']', '}', ')'] and text_to_match[-2:] != "..":
        while result[-1] != text_to_match[-1]:
            result += full_text[full_text.index(result) + len(result)][-1]

    return result
Example 2
 def _sustituirReg(self,aux,regOrig,regDest):
     # Regex is needed here because it is more powerful and can match in both directions
     regAux="^"+regOrig+"$"
     for m in regex.finditer(regAux,aux,overlapped=True):
         if self.parentesisCoherentes1(m.groupdict()):
             aux2=regDest.format(**agregaCTX(m.groupdict())) + "\n"
             return aux2
     regAux="(?r)^"+regOrig+"$"
     for m in regex.finditer(regAux,aux,overlapped=True):
         if self.parentesisCoherentes1(m.groupdict()):
             aux2=regDest.format(**agregaCTX(m.groupdict())) + "\n"
             return aux2
     return aux
Example 3
    def parse_ind_vars(self):
        """Define values of independent variables by parsing first example of form:
        [var name]  [value 0]
                    [value 1]
                    ...
                    [value n]
                    [blank]
                *or*
        [text]
        :return:
        """
        selem = np.array([[1, 0],
                         [1, 1]])
        # some weird bug with np.pad and string dtype
        s_type = 'S%d' % (max([len(x) for y in self.sheet for x in y]) + 10)
        xs_values = np.zeros(np.array(self.sheet.shape) + 1, dtype=s_type)
        xs_values[:-1, :-1] = self.sheet

        mask = (xs_values[:, :2] != '').astype(int)
        mask[:, 1] *= 2
        mask_string = ''.join(['ABCD'[i] for i in mask.sum(axis=1)])

        ind_vars = {}
        for x in re.finditer('(DC+)[ABD]', mask_string):
            name = xs_values[x.span()[0], 0]
            values = xs_values[x.span()[0]:x.span()[1] - 1, 1]
            ind_vars[name] = list(values)

        self.ind_vars.update(ind_vars)
Example 4
    def _match(self, text):
        matches = []
        if self.mapping:
            seq = self.map.keys()
        else:
            seq = self.regex_sequence

        for r in seq:
            for matchobj in re.finditer(r, text, overlapped=True):
                groups = (matchobj.groupdict())
                result = {
                    'start': matchobj.start(),
                    'end': matchobj.end(),
                    'regex': r,
                    'groups':groups
                }

                if self.mapping:
                    for k, v in self.map[r].items():
                        if k not in result.keys():
                            result[k] = v

                matches.append(
                    result
                )

        return matches
Example 5
    def prune_by_precision(self, min_precision, text_data_pairs):
        """
        Removes patterns from the model that don't reach a minimum precision

        :param float min_precision: the minimum precision required of a pattern when applied to the given data
        :param collections.Iterable text_data_pairs: an iterable of `(text, data)` pairs where `text` is a string and
            `data` is an anafora.AnaforaData object
        """
        pattern_scores = collections.defaultdict(lambda: anafora.evaluate.Scores())
        for text, data in text_data_pairs:

            # collect the spans of each type of reference annotation
            reference_type_spans_map = collections.defaultdict(lambda: set())
            for annotation in data.annotations:
                reference_type_spans_map[annotation.type].add(annotation.spans)

            # make predictions with each pattern in the model
            for pattern in self.regex_type_attributes_map:
                predicted_spans = {((m.start(), m.end()),) for m in regex.finditer(pattern, text)}
                if predicted_spans:
                    predicted_type, _ = self.regex_type_attributes_map[pattern]

                    # update the scores for this pattern
                    pattern_scores[pattern].add(reference_type_spans_map[predicted_type], predicted_spans)

        # delete any pattern with a precision lower than the minimum requested
        for pattern, scores in pattern_scores.items():
            if scores.precision() < min_precision:
                del self.regex_type_attributes_map[pattern]
Example 6
    def parse(self, data, regex = None, encoding = "utf-8"):
        regex = regex or self.master

        is_unicode = appier.legacy.is_unicode(data)
        if not is_unicode: data = data.decode(encoding)

        nodes = []
        matches = regex.finditer(data)

        current = 0

        for match in matches:
            name = match.lastgroup
            parts = match.groupdict()

            start, end = match.span()
            if start > current:
                value = data[current:start]
                value = value.replace("\r", "")
                value = value.replace("\n", " ")
                if value: nodes.append(value)

            method = getattr(self, "parse_" + name)
            node = method(parts)
            nodes.append(node)

            current = end

        remaining = data[current:]
        remaining = remaining.replace("\r", "")
        remaining = remaining.replace("\n", " ")
        if remaining: nodes.append(remaining)

        return nodes
Example 7
def tokenize(text):

    """
    Yield tokens.

    Args:
        text (str): The original text.

    Yields:
        dict: The next token.
    """

    stem = SnowballStemmer('norwegian').stem
    tokens = regex.finditer('\p{L}+', text.lower())


    for offset, match in enumerate(tokens):

        # Get the raw token.
        unstemmed = match.group(0)

        yield { # Emit the token.
            'stemmed':      stem(unstemmed),
            'unstemmed':    unstemmed,
            'offset':       offset
        }
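
As a side note, a minimal standalone sketch of the \p{L}+ Unicode-letter matching used above; it relies only on the third-party regex module, and the Norwegian sample string is purely illustrative.

import regex

text = "Når vi møtes, møtes vi."
words = [m.group(0) for m in regex.finditer(r'\p{L}+', text.lower())]
print(words)  # ['når', 'vi', 'møtes', 'møtes', 'vi']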
Example 8
def determine_match(commentary_name, commentary_regex):
    issues = 0
    full_pattern = u'{} (?>[@!/*][^ ]* )*(?P<dh>[^ ]+)'.format(commentary_regex)
    full_mechaber = Root('../../Even_HaEzer.xml').get_base_text()
    error_counter = Counter()

    for siman_num, siman in enumerate(full_mechaber.get_simanim()):
        for seif_num, seif in enumerate(siman.get_child()):
            matches = regex.finditer(full_pattern, unicode(seif))

            for regex_match in matches:
                c_ref = Ref(u'{} {}:{}'.format(commentary_name, siman_num+1, getGematria(regex_match.group('ref'))))
                try:
                    c_text = c_ref.text('he').text.split()[0]
                except IndexError:
                    continue
                c_text = re.sub(u'[^\u05d0-\u05ea]', u'', c_text)
                dh_text = re.sub(u'[^\u05d0-\u05ea]', u'',regex_match.group('dh'))

                ratio = fuzz.ratio(dh_text, c_text)

                if ratio < 75.0:
                    issues += 1
                    print u"Potential mismatch:"
                    print u"Shulchan Arukh, Even HaEzer {}:{}   {}".format(siman_num+1, seif_num+1, dh_text)
                    print u"{}   {}".format(c_ref.normal(), c_text)
                    print u"Score: {}".format(ratio)
                    error_counter[(dh_text, c_text)] += 1
    print u"Total issues: {}".format(issues)
    return error_counter
Example 9
    def __init__(self, names, features={}, ftstr='', weights=None):
        """Construct a `Segment` object

        Args:
            names (list): ordered list of feature names
            features (dict): name-value pairs for specified features
            ftstr (unicode): a string, each /(+|0|-)\w+/ sequence of which is
                             interpreted as a feature specification
            weights (float): order list of feature weights/saliences
            """
        self.n2s = {-1: '-', 0: '0', 1: '+'}
        self.s2n = {k: v for (v, k) in self.n2s.items()}
        self.names = names
        """Set a feature specification"""
        self.data = {}
        for name in names:
            if name in features:
                self.data[name] = features[name]
            else:
                self.data[name] = 0
        for m in re.finditer(r'(\+|0|-)(\w+)', ftstr):
            v, k = m.groups()
            self.data[k] = self.s2n[v]
        if weights:
            self.weights = weights
        else:
            self.weights = [1 for _ in names]
Example 10
def search_strand(pattern, sequence_to_scan, strand=1):
    '''
    take a sequence pattern (element) and find occurrences of that on the 
    provided, larger 5'-->3' sequence.
    Assumes strand is first unless provided.

    Tracks the start and end points of each occurrence, returning a list of
    that information where each element is a tuple of the start and end points
    along with the strand.

    Works with overlapped sequences because now 
    "regex.findall and regex.finditer support an ‘overlapped’ flag which 
    permits overlapped matches."
    , see https://pypi.python.org/pypi/regex/2018.02.21

    based on https://www.biostars.org/p/209383/ (specifically steve's answer)
    '''
    occurrences = []
    for match in regex.finditer(
        pattern.upper(), str(sequence_to_scan.upper()),overlapped=True):
        if strand == 1:
            start_pos = match.start() + 1
            end_pos = match.end() + 1
        else:
            start_pos = (len(sequence_to_scan) - match.start() ) + 1
            end_pos = (len(sequence_to_scan) - match.end() ) + 1
        # print (start_pos, '\t', end_pos, '\t',strand) # for debugging
        occurrences.append((start_pos, end_pos,strand))
    return occurrences
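
For reference, a standalone sketch of the overlapped matching described in the docstring above (only the third-party regex module is assumed):

import regex

# The second, overlapping TTT run would be missed without overlapped=True.
hits = [m.start() for m in regex.finditer("TTT", "AATTTTAA", overlapped=True)]
print(hits)  # [2, 3]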
Example 11
    def tokenize(self, value):
        """
        Perform the tokenizing.

        Required Argument
        value -- The unicode string to tokenize.

        """
        t = Token()  # The token instance we will reuse
        if not self._gaps:
            # The default: expression matches are used as tokens
            for pos, match in enumerate(self._regexp.finditer(value)):
                yield t.update(match.group(0), index=(match.start(), match.end(),), position=pos)
        else:
            # When gaps=True, iterate through the matches and
            # yield the text between them.
            left = 0
            last_pos = 0
            for pos, match in enumerate(regex.finditer(self._regexp, value)):
                right, next = match.span()
                if right != 0:
                    yield t.update(value[left:right], position=pos, index=(left, right,))
                left = next
                last_pos = pos
            if left != len(value):
                yield t.update(value[left:], position=last_pos+1, index=(left, len(value),))
Example 12
 def findall_p_in_s(p,s):
     """Return a series of matches for a pattern (p) in a str (s)."""
     match_strs = regex.findall(p,s)
     #get pairs of left and right indexes
     match_indexes = [(i.start(0),i.end(0)) for i in regex.finditer(p,s)]
     all_p_in_s = [Match(match_strs[i],match_indexes[i][0],match_indexes[i][1]) for i in range(0,len(match_strs))]
     return all_p_in_s
Example 13
 def _find(self, *args):
   global index
   try:
     for match in regex.finditer(u'^.+?:\d+?:.*%s.*$' % args[0], index.data, regex.MULTILINE | regex.IGNORECASE | regex.V1, concurrent=True):
       self._print(match.group(0))
   except sre_constants.error, e:
     print
Example 14
    def plx_wrapper(text):
        before = text
        text = delimToPanlex(text)
        idx_list = [ex_match.start() for ex_match in re.finditer('⫷(?:ex|df)(?::\w{1,4}-\d{1,3})?⫸', text)]
        if len(idx_list) == 0:
            return process_synonyms(proc)(text)

        idx_list.append( len(text))
        if len(text[ 0:idx_list[0] ].strip()) > 0:
            idx_list.insert(0,0)
        final_exp = []

        for idx in range(len(idx_list) - 1):
            ex = text[ idx_list[idx] : idx_list[idx+1]]

            tag,ex_text,attributes = get_plx_fields(ex)
            result = proc(ex_text)
            result_match = re.search('(⫷(?:ex|df)(?::\w{1,4}-\d{1,3})?⫸)(.+)', result)
            
            if result_match:
                if len(result_match[1].strip()) > 0:
                    final_exp.append('%s%s' % (result,attributes))
            else:
                if len(result.strip()) > 0:
                    final_exp.append('%s%s%s' % (tag,result,attributes))

        final_exp = filter_unique_meanings(final_exp)
        text = ''.join(final_exp)
        return text
Example 15
 def process(self, file):
     with open(file, 'r', encoding='utf8') as f:
         string = f.read()
     dropping = True
     last_is_comment = False
     out = [self.head()]
     for m in regex.finditer(r'#[.:]\s*(?<comment>.*)|msgstr (?:"(?<msgstr>.*)"\n?)+', string):
         print(m[0], dropping)
         if dropping:
             if m['comment']:
                 if regex.match(r'[\s\p{punct}]*VAR', m['comment']):
                     continue
                 body_m = regex.search(r'(?is)<body.*', m['comment'])
                 if body_m:
                     dropping = False
                     out.append(body_m[0])
                 
         else:
             if m['comment']:
                 out.append(m['comment'])
                 last_is_comment = True
             else:
                 passage = self.unescape(' '.join(m.captures('msgstr')))
                 passage = passage.strip()
                 if passage:
                     if not last_is_comment:
                         out.append(' ')
                     last_is_comment = False
                     out.append(passage)
                 
     html_string = ''.join(out)
     return self.pretty_print(html_string)
Example 16
def plot_location(needle, haystack,
                  cluster_id=None, nbins=20, size=(17, 2), fname=None):
    """plot_location."""
    locs = []
    for h, s in haystack:
        for match in re.finditer(needle, s):
            s = match.start()
            e = match.end()
            m = s + (e - s) / 2
            locs.append(m)
    plt.figure(figsize=size)
    n, bins, patches = plt.hist(
        locs, nbins, normed=0, facecolor='blue', alpha=0.3)
    plt.grid()
    plt.title(needle)
    plt.xlabel('Position')
    plt.ylabel('Num occurrences')
    if fname:
        plt.draw()
        figname = '%s_loc_%d.png' % (fname, cluster_id)
        plt.savefig(
            figname, bbox_inches='tight', transparent=True, pad_inches=0)
    else:
        figname = None
        plt.show()
    plt.close()
    return figname
Example 17
def process_verses(chap_string, expression):
    """
    Take an entire chapter as a string and break it up into verses. The new chapter index (a number followed by
    a space) must be stripped out.

    :param chap_string: All verses in a chapter combined as one string.
    :param expression: A compiled regular expression with which to find new verses.
    :return: A list of strings (jagged array), with each verse as a separate string.
    """

    # find all new verses with the regular expression
    matches = expression.finditer(chap_string)

    # save start position of first verse and initiate list of verses
    try:
        start = next(matches)
    except StopIteration:
        return [chap_string]
    verses = []

    # loop through matches until StopIteration is raised at the last verse
    while True:
        try:
            end = next(matches)
            verses.append(chap_string[start.end()-1:end.start()])
            start = end

        except StopIteration:
            verses.append(chap_string[start.end()-1:])
            break

    # error correction - look for numbers in each verse and compare to verse number
    # This will differentiate between incorrectly formatted verses numbers and other numbers in the text.
    corrected_verses = []
    for index, verse in enumerate(verses):
        nums = re.finditer(u'\d{1,3} ', verse)
        good = True

        for num in nums:
            if int(num.group()) - index == 2:

                # add first verse
                corrected_verses.append(verse[:num.start()])

                # edit second verse
                second = verse[num.start():]
                second = second.replace(num.group(), num.group()[:len(num.group())])
                corrected_verses.append(second)
                good = False
                break

        if good:
            corrected_verses.append(verse)

    # strip out the * marker used to help differentiate numbers and verses
    for index, verse in enumerate(corrected_verses):
        corrected_verses[index] = verse.replace(u'*', u'')

    return corrected_verses
Example 18
def find_mofit(s, t):
    """
    :return: All locations of t as a substring of s.
    :param: s, t Two DNA strings
    """
    matches = finditer(t, s, overlapped=True)
    ans = [str(element.start() + 1) for element in matches]
    return " ".join(ans)
Example 19
def glycocheck(protein):
    ind = []
    import regex as re

    matches = re.finditer("N[^P][ST][^P]", protein, overlapped=True)
    for i in matches:
        ind.append(i.start() + 1)
    return ind
Example 20
def MultipleApproxPatternMatch(inputset):
    patterns, d = inputset
    all_matched_index = []
    for eachp in patterns:
        reg = GetRE(eachp, d)
        for m in regex.finditer(reg, text, overlapped = True):
            all_matched_index.append(m.start())
    return all_matched_index
Example 21
 def finditer(
     pattern, string, flags=0, pos=None, endpos=None, overlapped=False,
     partial=False, concurrent=None, **kwargs
 ):
     return regex.finditer(
         _apply_search_backrefs(pattern, flags), string,
         flags, pos, endpos, overlapped, partial, concurrent, **kwargs
     )
Example 22
    def get_word_groups_from_line(self, song_line):
        word_groups = []
        # for match in re.finditer(r"(?:^|\s)([A-Za-z,'! &?.]+)(?:\s|$)", song_line):
        for match in re.finditer(r"(?:^|\s)([^-0-9;]+)(?:\s|$)", song_line):
            # print(song_line + " => '" + m.group(0).strip() + "'")
            word_group = Group(match.start(), match.group(0).strip())
            word_groups.append(word_group)

        return word_groups
Example 23
 def key(word):
     out = []
     for m in regex.finditer('(' + '|'.join(a) + ')|.', word):
         if m.group(1):
             if alpha[m[0]] is not None:
                 out.append(alpha[m[0]])
         else:
             out.append(-1)
     return out
Example 24
def get_glycan_sites(seq, regex_pattern, strip_gap):
    if strip_gap == True:    
        seq = seq.replace("-", "")
    sites = []
    iterator = re.finditer(regex_pattern, seq, overlapped=True)
    for match in iterator:
        start, end = match.span()
        sites.append(start+1)
    return sites
Example 25
def parse_scad_callables(scad_code_str):
    callables = []

    # Note that this isn't comprehensive; tuples or nested data structures in
    # a module definition will defeat it.

    # Current implementation would throw an error if you tried to call a(x, y)
    # since Python would expect a(x);  OpenSCAD itself ignores extra arguments,
    # but that's not really preferable behavior

    # TODO:  write a pyparsing grammar for OpenSCAD, or, even better, use the yacc parse grammar
    # used by the language itself.  -ETJ 06 Feb 2011

    no_comments_re = r'(?mxs)(//.*?\n|/\*.*?\*/)'

    # Also note: this accepts: 'module x(arg) =' and 'function y(arg) {', both
    # of which are incorrect syntax
    mod_re = r'(?mxs)^\s*(?:module|function)\s+(?P<callable_name>\w+)\s*\((?P<all_args>.*?)\)\s*(?:{|=)'

    # See https://github.com/SolidCode/SolidPython/issues/95; Thanks to https://github.com/Torlos
    args_re = r'(?mxs)(?P<arg_name>\w+)(?:\s*=\s*(?P<default_val>([\w.\"\s\?:\-+\\\/*]+|\((?>[^()]|(?2))*\)|\[(?>[^\[\]]|(?2))*\])+))?(?:,|$)'

    # remove all comments from SCAD code
    scad_code_str = re.sub(no_comments_re, '', scad_code_str)
    # get all SCAD callables
    mod_matches = re.finditer(mod_re, scad_code_str)

    for m in mod_matches:
        callable_name = m.group('callable_name')
        args = []
        kwargs = []
        all_args = m.group('all_args')
        if all_args:
            arg_matches = re.finditer(args_re, all_args)
            for am in arg_matches:
                arg_name = am.group('arg_name')
                if am.group('default_val'):
                    kwargs.append(arg_name)
                else:
                    args.append(arg_name)

        callables.append({'name': callable_name, 'args': args, 'kwargs': kwargs})

    return callables
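
The (?2) construct in args_re is a recursive subpattern, a regex-module feature that the stdlib re rejects; a minimal standalone sketch of the same idea, matching balanced parentheses with (?R):

import regex

# (?R) recurses into the whole pattern, so nested parentheses are matched as one unit.
balanced = regex.compile(r'\((?:[^()]++|(?R))*\)')
print(balanced.findall("translate(v=[1, (2+3), 4]) (loose)"))
# ['(v=[1, (2+3), 4])', '(loose)']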
Example 26
def pep_end(pep,seq):
	if len(findall(pep,seq)) > 1:
		runs = finditer(pep,seq)
		coord = []
		for match in runs:
			coord.append(match.end())
		return coord
	elif len(findall(pep,seq)) == 1:
		return search(pep,seq).end()
	else: return 'Not found'
Example 27
def find_out_regex(pattern, lines):
    end = len(lines)
    for match in regex.finditer(pattern, lines, regex.REVERSE):
        out = lines[match.end():end].rstrip('\n')
        if '\n' in out:
            return out
        else:
            end = match.start()
            # Skipping command
    raise Exception('No output to copy')
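
regex.REVERSE (the (?r) inline flag) scans from the end of the string, which is why the snippet above reaches the most recent command first; a tiny standalone illustration:

import regex

text = "one two one two one"
m = next(regex.finditer("one", text, regex.REVERSE))
print(m.span())  # (16, 19) -- the rightmost occurrence is yielded first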
Example 28
def write_index(seqname, seq, strings, index_file):
	"""
	Finds and then writes the indexes of the substrings
	"""
	if len(seq) > 1:
		tups = []
		for string in strings:
			positions = [m.start() for m in re.finditer(string, seq.upper(), overlapped=True)]
			positions_rev = [m.end()-1 for m in re.finditer(reverse_complement(string), seq.upper(), overlapped=True)]
			tups += zip(positions, ["+"] * len(positions), [string] * len(positions))
			tups += zip(positions_rev, ["-"] * len(positions_rev),  [string] * len(positions_rev))
			print("indexed " + str(len(positions) + len(positions_rev)) + " " + string + " sites")

		# Sort by position
		tups = sorted(tups, key=lambda x: x[0])

		# print(tups)
		for t in tups:
			index_file.write(seqname + "\t" + str(t[0]) + "\t" + t[1] + "\t" + t[2] + "\n")
Example 29
def parse_sequence(json_str):
	re_item = r'(?P<value>' + '|'.join(
		(re_bool,
		 re_float,
		 re_int,
		 re_none,
		 re_object,
		 re_sequence,
		 re_str)) + ')'
	items = regex.finditer(re_item, json_str[1:-1])
	return [from_json(i.group('value')) for i in items]
Example 30
 def get_start_end_indexes(seq, seq_file):
     ''' Find a given string in a given file and get the indexes of said
         string in the file
     '''
     find = [(m.start(0), m.end(0)) for m in regex.finditer(r'(?:'+seq+'){s<=2}'
                                                            , seq_file)][0]
     start = find[0]
     end = find[1]
     matched_seq = seq_file[start:end]
     
     return (start, end, matched_seq)
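
The {s<=2} suffix above is the regex module's fuzzy-matching syntax (allow up to two substitutions); a small standalone sketch, with the expected output hedged as illustrative:

import regex

read = "TTACGAACTT"
m = regex.search(r'(?:ACGTAC){s<=2}', read)
if m:
    print(m.span(), m.group(), m.fuzzy_counts)  # e.g. (2, 8) 'ACGAAC' (1, 0, 0)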
Example 31
    def finditer(pattern,
                 string,
                 flags=0,
                 pos=None,
                 endpos=None,
                 overlapped=False,
                 partial=False,
                 concurrent=None,
                 **kwargs):
        """Wrapper for finditer."""

        return regex.finditer(_apply_search_backrefs(pattern, flags), string,
                              flags, pos, endpos, overlapped, partial,
                              concurrent, **kwargs)
Example 32
def search_motif(protein, motif):
    """Search for a motif in a protein.

    Args:
        protein: sequence of amino acids
        motif:

    Return:
        a list of indexes of the motif's locations within protein
    """
    motif_regex = motif.replace("{", "[^").replace("}", "]")
    indexes = re.finditer(motif_regex, protein, overlapped=True)

    return [i.start() + 1 for i in indexes]
Example 33
def find_gap_positions_and_matching_characters(
        gap_string: str, character_string: str) -> List[Operation]:
    assert len(gap_string) == len(character_string)
    positions = [m.start() for m in re.finditer('<', gap_string)]
    grouped_positions = group_consecutive_indexes(positions)
    operations = []
    for group in grouped_positions:
        if len(group) == 0:
            continue
        characters = ""
        for index in group:
            characters += character_string[index]
        operations.append(Operation(group[0], characters))
    return operations
Example 34
 def __init__(self, match, parent, parent_scan_position):
     NObject.__init__(self, match.start() + parent_scan_position, parent)
     # process condition. A non-boolean value will be true if different from null.
     self.if_condition = self.scan_expression(
         match.group('IF_CONDITION'),
         parent_scan_position + match.start('IF_CONDITION'))
     # process if_body
     self.if_body = []
     for m in re.finditer(full_regex, match.group('IF_BODY')):
         if self.is_match_valid(m, parent_scan_position):
             self.if_body.append(
                 self.scan_fun_body(
                     m, parent_scan_position + match.start('IF_BODY')))
     # process else_body
     self.else_body = []
     if match.group('ELSE_BODY') != None:
         for m in re.finditer(full_regex, match.group('ELSE_BODY')):
             if self.is_match_valid(
                     m, parent_scan_position + match.start('ELSE_BODY')):
                 self.else_body.append(
                     self.scan_fun_body(
                         m,
                         parent_scan_position + match.start('ELSE_BODY')))
Example 35
def parse_footnotes(s):
    footnote_pattern = r'(?<!\d\s*)\\(?P<i>\d+)\\'
    # Parse footnotes
    footnote_paragraph_match = re.match(footnote_pattern, s)
    if footnote_paragraph_match:
        footnote = footnote_paragraph_match.group('i')
        footnotes = []
    else:
        footnote = np.nan
        footnotes = [m.group('i') for m in re.finditer(footnote_pattern, s)]

    s = re.sub(footnote_pattern, '', s)

    return footnote, footnotes, s
Example 36
def explicit_and(line, listAnd):
    """
    Parse string of type "course code and course code and course code ..." where course code 
    is used interchangbly with placeholder #ids

    Attributes
    ---------
    line: str
        pre-requisite string
    listAnd: str
        used in regex expression to check if course codes are explicitly joined by "and"
    
    Returns
    ----------
    matchObj: dict
        dictionary with id as key and parsed json array as value as 
        {'id': [{
                'type': 'AND',
                'groups': [(str) code, (str) code, ...]
            }]
        }
    line: str
        modified pre-requisite string with group replaced by #id
    """

    matchObj = {}
    matches = regex.finditer(listAnd, line)
    if matches != None:
        for x in matches:
            spl = regex.split(r' and |, ', x.group())

            codes = []

            for y in spl:
                codex = y
                if codex in matchObj:
                    codex = matchObj[codex]
                else:
                    codex = [codex]

                codes += codex

            obj = {'type': 'AND', 'groups': codes}

            id = genId()
            matchObj[id] = [obj]
            line = line.replace(
                x.group(), id)  # replace group substring with placeholder id

    return matchObj, line
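
A hedged usage sketch of the function above. Both genId and the real listAnd pattern live elsewhere in the original project, so a trivial stand-in generator and an illustrative "CODE and CODE" pattern are used here:

import itertools
import regex

_counter = itertools.count(1)

def genId():  # hypothetical stand-in for the project's id generator
    return '#{}'.format(next(_counter))

list_and = r'[A-Z]{4} \d{3}(?: and [A-Z]{4} \d{3})+'  # illustrative pattern only
print(explicit_and('MATH 101 and MATH 102 or STAT 200', list_and))
# e.g. ({'#1': [{'type': 'AND', 'groups': ['MATH 101', 'MATH 102']}]}, '#1 or STAT 200')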
Example 37
    def perform_camel_case_splitting(self):
        '''
		Convert all camelcase terms into individual terms
		ret1: processed content without any camelcase terms
		'''

        content = self.current_content
        # self.camel_case_split_content = regex.sub(r'([a-z]*)([A-Z])', r'\1 \2', content)
        matches = regex.finditer(
            '.+?(?:(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])|$)', content,
            regex.DOTALL)
        self.camel_case_split_content = " ".join([m.group(0) for m in matches])
        self.current_content = self.camel_case_split_content
        return self.camel_case_split_content
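
A quick standalone check of the camel-case splitting regex used above:

import regex

content = "getHTTPResponseCode"
parts = regex.finditer(r'.+?(?:(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])|$)', content, regex.DOTALL)
print(" ".join(m.group(0) for m in parts))  # get HTTP Response Code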
Example 38
    def try_merge_modifier_token(self,
                                 extract_result: ExtractResult,
                                 pattern: Pattern,
                                 source: str,
                                 potential_ambiguity: bool = False) -> bool:
        before_str = source[0:extract_result.start]
        after_str = source[extract_result.start:extract_result.length]

        # Avoid adding mod for ambiguity cases, such as "from" in "from ... to ..." should not add mod
        if potential_ambiguity and self.config.ambiguous_range_modifier_prefix and \
                regex.search(self.config.ambiguous_range_modifier_prefix, before_str):
            matches = list(
                regex.finditer(self.config.potential_ambiguous_range_regex,
                               source))
            if matches and len(matches):
                return any(match.start() < extract_result.start +
                           extract_result.length
                           and match.end() > extract_result.start
                           for match in matches)
                # return self._filter_item(extract_result, matches)

        token = self.has_token_index(before_str.strip(), pattern)
        if token.matched:
            mod_len = len(before_str) - token.index
            extract_result.length += mod_len
            extract_result.start -= mod_len
            extract_result.text = source[extract_result.
                                         start:extract_result.start +
                                         extract_result.length]

            extract_result.meta_data = self.assign_mod_metadata(
                extract_result.meta_data)
            return True
        elif self.config.check_both_before_after:
            # check also after_str
            after_str = source[extract_result.start:extract_result.length]
            token = self.has_token_index(after_str.strip(), pattern)
            if token.matched:
                mod_len = token.index + len(after_str) - len(after_str.strip())
                extract_result.length += mod_len
                extract_result.text = source[extract_result.
                                             start:extract_result.start +
                                             extract_result.length]
                extract_result.data = Constants.HAS_MOD
                extract_result.meta_data = self.assign_mod_metadata(
                    extract_result.meta_data)

                return True

        return False
Example 39
def get_range_conditions(range_string):
    for matches in regex.finditer(r"(?:([0-9]*)[:\-.]([0-9]*)|([0-9]+))", range_string):
        start, end, singular = matches.groups()
        if start and end and int(start) > int(end):
            start, end = end, start
        yield (lambda x, s=singular: int(s) == x) if singular else (
            lambda x: True
        ) if not (start or end) else (
            lambda x, s=start: x >= int(s)
        ) if start and not end else (
            lambda x, e=end: x <= int(e)
        ) if not start and end else (
            lambda x, s=start, e=end: int(s) <= x <= int(e)
        )
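
A usage sketch to make the chained conditional expressions above more concrete; it assumes the generator above is in scope (import regex is required by it):

import regex  # required by get_range_conditions above

conditions = list(get_range_conditions("2-5,8"))
print([[x for x in range(1, 11) if cond(x)] for cond in conditions])
# [[2, 3, 4, 5], [8]]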
Example 40
def sort_by_barcode(list_sParameters):

    print('Processing %s' % list_sParameters[1])

    sSplitTag = list_sParameters[0]
    sInFastq = list_sParameters[1]
    sTempOut = list_sParameters[2]
    sBarcodeFile = list_sParameters[3]
    sRE = '[T]{7}'
    nBarcode3Cut = 3
    nBarcode5Ext = 18
    dict_sBarcodes = load_PE_input(sBarcodeFile)

    dict_sOutput = {}
    InFile = open(sInFastq, 'r')
    for i, sReadLine in enumerate(InFile):

        if i % 4 == 0: sReadID = sReadLine.replace('\n', '')
        if i % 4 != 1: continue

        sNGSSeq = sReadLine.replace('\n', '').upper()

        for sReIndex in regex.finditer(sRE, sNGSSeq, overlapped=True):
            nIndexStart = sReIndex.start()
            nIndexEnd = sReIndex.end()
            sBarcode = sNGSSeq[nIndexStart + nBarcode3Cut:nIndexEnd +
                               nBarcode5Ext]

            if nIndexStart > (len(sNGSSeq) / 2):
                continue  # SKIP barcode in back of read

            ### Skip Non-barcodes ###
            try:
                cPE = dict_sBarcodes[sBarcode]
            except KeyError:
                continue
            #########################

            if sBarcode not in dict_sOutput:
                dict_sOutput[sBarcode] = []
            dict_sOutput[sBarcode].append([sReadID, sNGSSeq])

        #loop END: i, sReadLine
    #loop END: cPE
    InFile.close()
    ## Pickle Out ##
    sOutFile = '%s/%s.data' % (sTempOut, sSplitTag)
    OutFile = open(sOutFile, 'wb')
    pickle.dump(dict_sOutput, OutFile)
    OutFile.close()
Example 41
def _get_locations(symbol_name, text):
    locations = []
    for sentence in finditer(r'(^|[\.\!\?])[\s]*([^\.\!\?]+([\.\!\?]|$))',
                             text,
                             overlapped=True):
        existences = _locator(symbol_name, sentence.group(2))
        if len(existences) == 0:
            continue
        start = sentence.start(2)
        end_shift = sentence.end(2) - start
        sent_locations = [(start, ex[0], ex[1] - ex[0], end_shift)
                          for ex in existences]
        locations += sent_locations
    return locations
Example 42
def find_motif(dna: str, motif: str, re: bool) -> list:
    locs = []
    # Case where the motif is given as a plain sequence rather than a regular expression
    if re == False:
        ml = len(motif)
        for i in range(len(dna)):
            if dna[i] == motif[0]:
                if i + ml <= len(dna) and dna[i: i + ml] == motif:
                    locs.append(i)
    else:
        motif = regex.compile(motif)
        for match in regex.finditer(motif, dna, overlapped=True):
            locs.append(match.start())
    return (locs)
Example 43
    def annotate(self, doc):
        doc = doc.upper().replace('\r\n', '\n')

        #negation detection works better with newlines removed
        doc = doc.replace('\n', ' ')
        doc = doc.replace('\r', ' ')
        doc = regex.sub(r'  +', ' ', doc)
        doc = doc.replace(" .", ".")
        doc = regex.sub(r'\.+', ".", doc)

        doc_data = {'mentions': []}

        for dr in self.all_drugs:
            #{'mentioned':False, 'negated':False, 'status': False}
            reg = self.positive_regex[dr]
            m = regex.finditer(reg.pattern, doc)
            for match in m:
                # each time the drug name is found, test for negation
                mention = {
                    'drug': dr,
                    'mentioned': True,
                    'negated': False,
                    'allergic': False
                }
                mention.update({x: False for x in self.user_regex})
                # get context and search within it for negation
                ctx_from = max(match.span()[0] - self.review_window, 0)
                ctx_to = min(match.span()[1] + self.review_window, len(doc))
                ctx = doc[ctx_from:ctx_to]
                mention['ctx'] = ctx
                mention['start'] = match.span()[0]
                mention['end'] = match.span()[1]

                ctx_from = max(match.span()[0] - self.negation_window, 0)
                ctx_to = min(match.span()[1] + self.negation_window, len(doc))
                ctx = doc[ctx_from:ctx_to]

                for negator in self.negative_regex[dr]:
                    if len(negator.findall(ctx)) > 0:
                        mention['negated'] = True
                for name in self.user_regex:
                    for pattern in self.user_regex[name]:
                        if len(pattern.findall(ctx)) > 0:
                            mention[name] = True
                al = self.detect_allergy(ctx, dr)
                mention['allergic'] = al

                doc_data['mentions'].append(mention)

        return doc_data
Example 44
    def basic_regex_match(self, source: str) -> []:
        from .utilities import Token
        result: List[Token] = list()

        for pattern in self.config.time_regex_list:
            matches = list(regex.finditer(pattern, source))

            # @TODO Workaround to avoid incorrect partial-only matches. Remove after time regex reviews across languages.
            matches = list(filter(lambda match: self.lth_check(match),
                                  matches))

            result.extend(map(lambda x: Token(x.start(), x.end()), matches))

        return result
Example 45
    def __analyze_patterns(self, text: str, flags: int = None):
        """
        Evaluate all patterns in the provided text.

        Logic includes detecting words in the provided deny list.
        In a sentence we could get a false positive at the end of our regex, were we
        want to find the IBAN but not the false positive at the end of the match.

        i.e. "I want my deposit in DE89370400440532013000 2 days from today."

        :param text: text to analyze
        :param flags: regex flags
        :return: A list of RecognizerResult
        """
        results = []
        for pattern in self.patterns:
            matches = re.finditer(pattern.regex, text, flags=self.flags)

            for match in matches:
                for grp_num in reversed(range(1, len(match.groups()) + 1)):
                    start = match.span(0)[0]
                    end = (match.span(grp_num)[1]
                           if match.span(grp_num)[1] > 0 else match.span(0)[1])
                    current_match = text[start:end]

                    # Skip empty results
                    if current_match == "":
                        continue

                    score = pattern.score

                    validation_result = self.validate_result(current_match)
                    description = PatternRecognizer.build_regex_explanation(
                        self.name, pattern.name, pattern.regex, score,
                        validation_result)
                    pattern_result = RecognizerResult(
                        self.supported_entities[0], start, end, score,
                        description)

                    if validation_result is not None:
                        if validation_result:
                            pattern_result.score = EntityRecognizer.MAX_SCORE
                        else:
                            pattern_result.score = EntityRecognizer.MIN_SCORE

                    if pattern_result.score > EntityRecognizer.MIN_SCORE:
                        results.append(pattern_result)
                        break

        return results
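
The docstring's IBAN example can be illustrated with plain finditer and an illustrative pattern (not the library's real IBAN regex): ending the result at the capture group's end drops the trailing " 2" that the full match would include.

import re

text = "I want my deposit in DE89370400440532013000 2 days from today."
for m in re.finditer(r'\b([A-Z]{2}\d{20,30})(?: \d)?', text):
    print(m.span(0), m.span(1))       # whole match vs. inner group
    print(text[m.start(1):m.end(1)])  # DE89370400440532013000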
Example 46
    def hightlight_keywords(self,
                            text,
                            keywords,
                            light_color='#ffea593d',
                            deep_color='#ffc107'):
        hightlighted_html = ''
        all_hightlights = []
        tokens = self.full_tokenize(text)
        for t in tokens:
            t['background_color'] = []

        for w in keywords:
            matches = regex.finditer(
                r'\b{}\b'.format(regex.escape(w, special_only=True)),
                text,
                flags=regex.IGNORECASE
            )
            all_hightlights.extend([
                {
                    'start': m.start(),
                    'end': m.end(),
                    'text': m.group(),
                }
                for m in matches
            ])

        all_hightlights = sorted(all_hightlights, key=lambda x: x['start'])

        for h in all_hightlights:
            for t in tokens:
                if (t['start'] >= h['start'] and t['end'] <= h['end']):
                    t['background_color'].append(light_color)

        for t in tokens:
            color_len = len(t['background_color'])
            if color_len == 0:
                hightlighted_html += t['text']
            elif color_len == 1:
                hightlighted_html += \
                    '<span style="background-color:{background_color};">{text}</span>'.format(
                    background_color=t['background_color'][0],
                    text=t['text']
                )
            else:
                hightlighted_html += \
                    '<span style="background-color:{background_color};">{text}</span>'.format(
                    background_color=deep_color,
                    text=t['text']
                )
        return hightlighted_html
Example 47
def match_utterance(user_input):
    """
    match_utterance accepts the user's utterance and
    tries to find a template matching it
    """
    for key in dialogue_pairs:
        matches = list(re.finditer(key, user_input))
        if matches:
            choice = np.random.randint(len(dialogue_pairs[key]))
            answer = re.sub(key,
                            dialogue_pairs[key][choice],
                            user_input,
                            count=1)
            return answer
Example 48
def split_into_words(token: str) -> List[ParsedToken]:
    """
    >>> split_into_words("    var = 9.4\\t\\n")
    [<Tab>, SplitContainer[Word(('var', none))], NonCodeChar(=), <Number>(9), \
NonCodeChar(.), <Number>(4), <Tab>, <NewLine>]
    """
    res = []
    four_char_whitespace = " " * 4
    for m in regex.finditer(f"(\\w+|[^ ]|{four_char_whitespace})", token):
        if m[0] == four_char_whitespace:
            res.append(Tab())
        else:
            res.append(to_parsed_token(m[0]))
    return res
Example 49
    def _calculate_representation_original_array(self):
        original_interval_start = max(0, self.list_repeats_starts[0] -
                                      5)  # flanks for regex
        original_interval_end = self.list_repeats_starts[-1] + len(
            self.list_repeats[-1]) + 5  # flanks for regex

        original_dna_interval = self.full_dna[
            original_interval_start:original_interval_end]
        search_pattern = f"(?e)({self.repeat_seq_candidate})" + "{" + "i<=3,d<=3,s<=3,s+i+d<=6" + "}"

        original_matches = list(
            regex.finditer(search_pattern, original_dna_interval))

        self.list_repeats_original = [
            match.group() for match in original_matches
        ]

        self.list_repeats_starts_original = [
            original_interval_start + match.start()
            for match in original_matches
        ]

        if len(original_matches) > 1:
            internal_spacer_starts = [
                match.end() for match in original_matches
            ][:-1]
            internal_spacer_end = [
                match.start() for match in original_matches
            ][1:]
            internal_spacer_coordinates = [(start, end) for start, end in zip(
                internal_spacer_starts, internal_spacer_end)]
            self.list_spacers_original = [
                original_dna_interval[start_end[0]:start_end[1]]
                for start_end in internal_spacer_coordinates
            ]
        else:
            self.list_spacers_original = []

        relative_error_indexes = []
        for match in original_matches:
            tuple_match_errors = match.fuzzy_changes
            list_relative_errors = [[e - match.start() for e in err_type]
                                    for err_type in tuple_match_errors]
            relative_error_indexes.append(list_relative_errors)

        self.list_repeats_error_indexes_original = relative_error_indexes

        self.list_fuzzy_counts_original = [
            match.fuzzy_counts for match in original_matches
        ]
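
match.fuzzy_changes, used above to recover per-error positions, can be seen in isolation with a tiny standalone sketch (the pattern and target are illustrative; a reasonably recent regex release is assumed):

import regex

# Allow up to one substitution, insertion and deletion; fuzzy_changes reports
# the positions of each kind of error found in the target string.
m = regex.search(r'(?e)(?:GGGTTT){i<=1,d<=1,s<=1}', "AAGGGTTAAA")
if m:
    print(m.group(), m.fuzzy_counts, m.fuzzy_changes)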
Example 50
def format_internal_header_links(document, notes):
    """Formats Obsidian style header links"""
    matches = re.finditer("\\[{2}([^|#\\]]*?)#(.*?)\\]{2}", document)

    for match in matches:
        text = match.group(2)
        link = slug_case(match.group(1)) + ".html#" + slug_case(match.group(2))
        files = [note['filename'] for note in notes]
        if match.group(1) in files:
            document = document.replace(match.group(), md_link(text, link))
        else:
            document = document.replace(match.group(), text)

    return document
Example 51
 def xp_single_sub_check(self, sub, regex, error_type, color, info_text,
                         reflags):
     res = re.finditer(regex,
                       sub.text,
                       flags=re.U
                       | reflags if reflags != None else re.UNICODE)
     res_list = []
     for item in res:
         res_list.append((item.start(), item.end()))
     if len(res_list) > 0:
         #sub.info = (error_type, (u'<span foreground="'+color+'">' + info_text + u'</span>', res_list))
         sub.info = (error_type, (info_text, res_list))
     else:
         sub.info = (error_type, '')
Example 52
    def get_label_masks(self,
                        label,
                        source_pattern,
                        translation,
                        source,
                        target: Optional[str] = None) -> Tuple[str, str, Dict]:
        """
        Search for `source_pattern` in `source`.
        If `target` is None, replace the matched text with `label`.
        If `target` is not None, replace the matched text with `label` only if `translation` can be found in `target`.
        If `translation` is None, the matched text is used to match against `target` instead.
        """
        masks = []
        source_mods = []
        target_mods = []
        for source_match in re.finditer(source_pattern, source):
            matched_text = source_match.group()
            source_match_position = source_match.start()
            replacement_text = translation if translation is not None else matched_text
            loc_in_target = -1 if target is None else target.find(replacement_text)

            if target is None or loc_in_target != -1:
                self.counts[label] += 1
                labelstr = self.get_mask_string(label, self.counts[label])

                # Run the regex again so we make sure to get the exact place
                # (str.replace here would be quicker but may find a match in the sentence
                #  not exactly like the matched regex)
                source_mods.append((source_match_position, len(matched_text), labelstr))
                if target is not None:
                    target_mods.append((loc_in_target, len(replacement_text), labelstr))

                mask = { "maskstr" : labelstr.strip(), "matched" : matched_text, "replacement" : replacement_text }
                masks.append(mask)
            else:
                self.counts_missed[label] += 1

        def apply_mods(text, mod_list):
            if mod_list:
                offset = 0
                for i, matched_len, mask in mod_list:
                    i += offset
                    text = text[0:i] + mask + text[i + matched_len:]
                    offset += len(mask) - matched_len
            return text

        source = apply_mods(source, source_mods)
        target = apply_mods(target, target_mods)

        return source, target, masks
Example 53
def plot_location(needle, haystack, nbins=20, size=(17, 2)):
    """plot_location."""
    locs = []
    for h, s in haystack:
        for match in re.finditer(needle, s):
            s = match.start()
            e = match.end()
            m = s + (e - s) / 2
            locs.append(m)
    plt.figure(figsize=size)
    n, bins, patches = plt.hist(
        locs, nbins, normed=0, facecolor='blue', alpha=0.3)
    plt.grid()
    plt.show()
Example 54
def match(original_text, word_or_token_list_to_match, clean_text=None):
    '''See README.md for a description of how to use this function.'''

    regex_flags = re.U | re.I

    if len(word_or_token_list_to_match) == 0:
        return []

    if not (clean_text):
        clean_text = _cleanup_text(original_text)

    if type(word_or_token_list_to_match) is list:
        to_match = untokenize(" ".join(word_or_token_list_to_match).strip())
        matches = [
            (m.start(), m.end(), original_text[m.start():m.end()])
            for m in re.finditer(re.escape(to_match), clean_text, regex_flags)
        ]
        if len(matches) == 0:
            matches = [(m.start(), m.end(), original_text[m.start():m.end()])
                       for m in re.finditer(
                           r'\s*'.join(
                               re.escape(w)
                               for w in word_or_token_list_to_match),
                           original_text, regex_flags)]
            if len(matches) == 0:
                edit_distance_match = _match_by_edit_distance(
                    clean_text, re.sub(r'\\s[\*\+]', r' ', to_match))
                matches = [(m.start(), m.end(),
                            original_text[m.start():m.end()])
                           for m in re.finditer(re.escape(edit_distance_match),
                                                clean_text, regex_flags)]
                if len(matches) == 0:
                    edit_distance_match = _match_by_edit_distance(
                        original_text, re.sub(r'\\s[\*\+]', r' ', to_match))
                    matches = [
                        (m.start(), m.end(), original_text[m.start():m.end()])
                        for m in re.finditer(re.escape(edit_distance_match),
                                             original_text, regex_flags)
                    ]
                    if len(matches) == 0:
                        edit_distance_match = _match_by_edit_distance(
                            original_text,
                            " ".join(word_or_token_list_to_match))
                        matches = [(m.start(), m.end(),
                                    original_text[m.start():m.end()])
                                   for m in re.finditer(
                                       re.escape(edit_distance_match),
                                       original_text, regex_flags)]
                        if len(matches) == 0:
                            return []
    else:
        matches = [(m.start(), m.end(), original_text[m.start():m.end()])
                   for m in re.finditer(
                       r'\b' + re.escape(word_or_token_list_to_match) +
                       r'\b', clean_text, regex_flags)]

    return sorted(matches)
Example 55
def process_marked_lines(lines, markers, return_flags=[False, -1, -1]):
    """Run regexes against message's marked lines to strip quotations.

    Return only last message lines.
    >>> mark_message_lines(['Hello', 'From: [email protected]', '', '> Hi', 'tsem'])
    ['Hello']

    Also returns return_flags.
    return_flags = [were_lines_deleted, first_deleted_line,
                    last_deleted_line]
    """
    markers = ''.join(markers)
    # if there are no splitter there should be no markers
    if 's' not in markers and not re.search('(me*){3}', markers):
        markers = markers.replace('m', 't')

    if re.match('[te]*f', markers):
        return_flags[:] = [False, -1, -1]
        return lines

    # inlined reply
    # use lookbehind assertions to find overlapping entries e.g. for 'mtmtm'
    # both 't' entries should be found
    for inline_reply in re.finditer('(?<=m)e*((?:t+e*)+)m', markers):
        # long links could break sequence of quotation lines but they shouldn't
        # be considered an inline reply
        links = (RE_PARENTHESIS_LINK.search(lines[inline_reply.start() - 1])
                 or RE_PARENTHESIS_LINK.match(
                     lines[inline_reply.start()].strip()))
        if not links:
            return_flags[:] = [False, -1, -1]
            return lines

    # cut out text lines coming after splitter if there are no markers there
    quotation = re.search('(se*)+((t|f)+e*)+', markers)
    if quotation:
        return_flags[:] = [True, quotation.start(), len(lines)]
        return lines[:quotation.start()]

    # handle the case with markers
    quotation = (RE_QUOTATION.search(markers)
                 or RE_EMPTY_QUOTATION.search(markers))

    if quotation:
        return_flags[:] = True, quotation.start(1), quotation.end(1)
        return lines[:quotation.start(1)] + lines[quotation.end(1):]

    return_flags[:] = [False, -1, -1]
    return lines
Example 56
    def match_simple_cases(self, regexp: Pattern, source: str) -> List[Token]:
        result: List[Token] = list()

        for regexp in self.config.simple_cases_regex:
            matches: [Match] = regex.finditer(regexp, source)

            if matches:
                for match in matches:

                    if RegExpUtility.get_group(match, Constants.MINUTE_GROUP_NAME) or\
                            RegExpUtility.get_group(match, Constants.SECOND_GROUP_NAME):

                        end_with_valid_token = True
                        if (source.index(match.group()) + (match.end() - match.start())) == len(source):
                            end_with_valid_token = True

                        else:
                            after_str = source[source.index(match.group()) + (match.end() - match.start())]

                            end_with_general_endings = self.config.general_ending_regex.match(after_str)
                            end_with_am_pm = RegExpUtility.get_group(match, Constants.RIGHT_AM_PM_GROUP_NAME)

                            if end_with_general_endings or end_with_am_pm or\
                                    after_str.lstrip().startswith(self.config.token_before_date):
                                end_with_valid_token = True
                            elif (self.config.options & DateTimeOptions.ENABLE_PREVIEW) != 0:
                                # When TimeZone be migrated enable it
                                end_with_valid_token = False

                        if end_with_valid_token:
                            result.append(Token(source.index(match.group()), source.index(match.group()) +
                                                (match.end() - match.start())))
                    else:
                        match_pm_str = RegExpUtility.get_group(match, Constants.PM_GROUP_NAME)
                        match_am_str = RegExpUtility.get_group(match, Constants.AM_GROUP_NAME)
                        desc_str = RegExpUtility.get_group(match, Constants.DESC_GROUP_NAME)

                        if match_pm_str or match_am_str or desc_str:
                            result.append(Token(source.index(match.group()), source.index(match.group()) +
                                                (match.end() - match.start())))
                        else:
                            after_str = source[source.index(match.group()) + (match.end() - match.start()):]

                            # When TimeZone be migrated enable it
                            if (self.config.options & DateTimeOptions.ENABLE_PREVIEW) != 0:
                                result.append(Token(source.index(match.group()),
                                                    source.index(match.group()) + (match.end() - match.start())))

        return result
Example 57
    def detect_reference_start(self):

        self.references_start_index = 0
        pdftext = self.get_fulltext()  #str(self.pdf)
        reference_regexes = [
            r"\breferences\b",  # Title: References
            r"\bliterature cited\b"
        ]  # Title: Literature cited
        potential_reference_starts = []

        for title_wording in reference_regexes:
            potential_reference_starts += [
                r.end()
                for r in re.finditer(title_wording, pdftext, re.IGNORECASE)
            ]

        print("Found these starts of potential reference section")
        for ref in potential_reference_starts:
            print(f"MATCH: {pdftext[ref-1:ref+20]}... at location {ref}")

        print(
            f"There are {len(potential_reference_starts)} potential reference start points"
        )

        if len(potential_reference_starts) == 0:
            print("Could not detect references start point.")
        elif len(potential_reference_starts) == 1:
            self.references_start_index = potential_reference_starts[0]
        else:
            print(
                "There were more than one probable reference section. Detecting the most probable one."
            )
            most_probable_start_point = 0
            most_year_mentions = 0
            # The test for the correct reference point is based on how many year numbers follow the keyword.
            # As references are more likely to be near the end of the document (although not always the case),
            # the number of year mentions is multiplied by the index of the finding...
            for index, test_start in enumerate(potential_reference_starts):
                test_text = pdftext[test_start:test_start + 1000]
                nr_year_mentions = (index + 1) * len(
                    re.findall(r"[\s\(\.,;]((?:19|20)\d\d)[\s\)\.,;]",
                               test_text))
                print(
                    f"Position {test_start} is followed by {nr_year_mentions} year mentions."
                )
                if nr_year_mentions > most_year_mentions:
                    most_probable_start_point = test_start
                    most_year_mentions = nr_year_mentions
            self.references_start_index = most_probable_start_point
Example 58
    def extract(self,
                source: str,
                reference: datetime = None) -> List[ExtractResult]:

        if reference is None:
            reference = datetime.now()

        result: List[ExtractResult] = list()
        if not source:
            return result

        match_source: Dict[Match, any] = dict()
        matched: List[bool] = [False] * len(source)

        collections = list(
            map(lambda x: (list(regex.finditer(x[0], source)), x[1]),
                self._regex_dict.items()))
        collections = list(filter(lambda x: len(x[0]) > 0, collections))

        for collection in collections:
            for match in collection[0]:
                for j in range(len(match.group())):
                    matched[match.start() + j] = True
                match_source[match] = collection[1]

        last = -1
        for i in range(len(source)):
            if matched[i]:
                if i + 1 == len(source) or not matched[i + 1]:
                    start = last + 1
                    length = i - last
                    text = source[start:start + length].strip()
                    src_match = next(
                        (x for x in iter(match_source)
                         if (x.start() == start and (x.end() -
                                                     x.start()) == length)),
                        None)
                    if src_match:
                        value = ExtractResult()
                        value.start = start
                        value.length = length
                        value.text = text
                        value.type = self.extractor_type_name
                        value.data = self.__get_data(match_source, src_match)
                        result.append(value)
            else:
                last = i

        return result
Example 59
def valueRe(value):
    '''CHECKS WHETHER A VALUE HAS ANY TRAILING SPACES (TEST FURTHER)'''
    # I don't know what I'm doing

    # 1. Define regex expression
    r = '^[-\w/#]*|[-\w/#]*$'

    # 2. Find the match in the value string
    for x in regex.finditer(r, value):
        if x.group(0) != '':
            return x.group(0)
    else:
        # Returns the value itself if for any reason the regex search does not work
        # Reasons being, for example, that the idiot programmer didn't test against corner cases
        return value
Example 60
def init_mapper(reads, strand):
    """
    Basic mapping function, based on regex, finds a match in a genome 
    and retrieves the location of insertion within the genome.
    
    reads - a list of queries 
    strand - the genome sequence as a string (positive or negative)
    """
    found = list()
    for read in reads:
        for match in re.finditer(read, strand):
            info = '%02d-%02d: %s' % (match.start(), match.end(),
                                      match.group(0))
            found.append((read, info))
    return found  # the hits found by mapping
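
A brief usage sketch, assuming the function above is in scope together with import re:

import re  # used by init_mapper above

reads = ["ACGT", "TTT"]
genome = "ACGTTTACGT"
for read, info in init_mapper(reads, genome):
    print(read, info)
# ACGT 00-04: ACGT
# ACGT 06-10: ACGT
# TTT 03-06: TTT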