Example #1
0
 def test_hash_regex(self):
     """Check that dumping a compiled regex yields a pattern-dependent digest.

     Two ``regex.Regex`` objects built from the same pattern must serialize
     to the same MD5 digest, while a different pattern must not.
     """
     foo_digest, bar_digest, foo_again_digest = [
         md5(datasets.utils.dumps(regex.Regex(source))).hexdigest()
         for source in ("foo", "bar", "foo")
     ]
     self.assertEqual(foo_digest, foo_again_digest)
     self.assertNotEqual(foo_digest, bar_digest)
Example #2
0
def get_people_winner(tweets,award_names):
    """Pick a winner name for every people-type award in *award_names*.

    For each award listed in ``reg.people_award`` the non-retweet tweets whose
    text matches the award's regex are passed to ``extract_award``; the
    extracted strings are ranked by frequency and the first one accepted by
    ``get_names`` or ``validate_name`` is chosen.

    Args:
        tweets: iterable of dicts carrying the tweet body under ``'text'``.
        award_names: iterable of award names; names that are not in
            ``reg.people_award`` are ignored.

    Returns:
        dict mapping each handled award name to the chosen candidate
        (``""`` when no candidate was extracted at all).
    """
    nlp = spacy.load('en')
    reg = regex.Regex()
    results = {}
    for movie in award_names:
        if movie not in reg.people_award:
            continue
        search_term = reg.getRegex(movie)

        result = []
        for tweet in tweets:
            text = tweet['text']
            # Skip retweets and tweets that do not mention this award.
            if 'RT' in text or not re.search(search_term, text):
                continue
            winner = extract_award(text)
            if winner:
                result.append(winner)

        # Walk candidates from most to least frequent and stop at the first
        # one that validates as a person name.
        # NOTE(review): when nothing validates, the *least* frequent candidate
        # is kept (last loop value) -- confirm this fallback is intended.
        name = ""
        for candidate, _count in Counter(result).most_common():
            name = candidate
            if get_names(candidate, nlp) or validate_name(candidate):
                break
        results[movie] = name
    return results
Example #3
0
    def read_from_file(self, fp):
        """Populate this object from the open text file *fp*.

        The first line holds the integer size.  Three groups follow; for each
        group one line is read and discarded, then ``2 * size + 1`` lines are
        read, stripped and wrapped in ``regex.Regex``.  Finally a fresh
        ``Grid`` of the given size is created.
        """
        self.size = int(fp.readline())
        patterns_per_group = 2 * self.size + 1

        for group in range(3):
            fp.readline()  # line preceding each group is ignored
            compiled = []
            for _ in range(patterns_per_group):
                compiled.append(regex.Regex(fp.readline().strip()))
            self.regex[group] = compiled

        self.grid = Grid(self.size)
Example #4
0
def main():
    """Compare the custom regex engine with the standard ``re`` module.

    Prints whether each implementation matches the sample string and how long
    a single match takes according to ``benchmark_micros``.

    The output uses single-argument ``print(...)`` calls so the function emits
    the same text under both Python 2 and Python 3 (the original ``print``
    statements are a syntax error on Python 3).
    """
    pattern = '(foo(ba)?)*(bar)+'
    test = 'foofoobabarbar'

    r = regex.Regex(pattern)
    r2 = re.compile(pattern)

    print("Custom implementation matches: %s"
          % ("yes" if r.match(test) else "no"))
    # ``re.match`` only anchors at the start, so require the whole string to
    # have been consumed before calling it a match.
    m = r2.match(test)
    match = m is not None and m.group(0) == test
    print("Re module implementation matches: %s" % ("yes" if match else "no"))

    print("Custom implementation runs in {:.3} microseconds".format(
        benchmark_micros(lambda: r.match(test))))
    print("Re module implementation runs in {:.3} microseconds".format(
        benchmark_micros(lambda: r2.match(test))))
Example #5
0
def get_allAwards(tweets, year):
    """Guess the award names mentioned in *tweets*.

    Non-retweet tweets containing "best" together with a win/announce marker
    are passed to ``extract_award``.  Extracted strings are filtered, ranked
    by frequency, and then run through the ``merge_awards`` /
    ``filter_awards`` / ``sanitize`` / ``final_sanitize`` pipeline.

    Args:
        tweets: iterable of dicts carrying the tweet body under ``'text'``.
        year: unused; kept for interface compatibility.

    Returns:
        list of at most 26 award names (first element of each final result).
    """
    search_terms = ['best', 'w[io]n', 'go(es)? to', 'went to', ':']
    award_names = []
    for tweet in tweets:
        if 'RT' in tweet['text']:
            continue
        text = tweet['text'].lower()
        if not re.search(search_terms[0], text):
            continue
        if not any(re.search(term, text) for term in search_terms[1:]):
            continue

        extract_str = extract_award(text)
        # extract_award may yield nothing (its other caller guards for a falsy
        # result); skip instead of crashing in the checks below.
        if not extract_str:
            continue
        if 'usa' in extract_str or 'globe' in extract_str:
            continue
        if len(extract_str.split()) < 4:
            continue
        if extract_str.count('-') > 1:
            continue
        award_names.append(extract_str)

    results = Counter(award_names).most_common()

    results = merge_awards(results[:200])
    results = sorted(results, key=lambda x: x[1], reverse=True)
    results = filter_awards(results)
    results = sanitize(results[:100])
    results = final_sanitize(results)
    # Keep only the 26 best-ranked names.
    return [res[0] for res in results[:26]]


# import json
# tweets = json.load(open("gg2015.json"))
# results = get_allAwards(tweets,2015)
# print(results)
Example #6
0
            obj.co_lnotab,
            obj.co_freevars,
            obj.co_cellvars,
        )
    pickler.save_reduce(CodeType, args, obj=obj)
    dill._dill.log.info("# Co")
    return


def copyfunc(func):
    """Return a new function object equivalent to *func*.

    The copy shares *func*'s code object, globals and closure cells and has
    the same name and positional defaults.  Keyword-only defaults are carried
    over as well (as a separate dict); without them a copy of
    ``def f(*, c=3)`` would raise ``TypeError`` when called without ``c``.
    """
    result = types.FunctionType(func.__code__, func.__globals__, func.__name__, func.__defaults__, func.__closure__)
    kwdefaults = func.__kwdefaults__
    if kwdefaults is not None:
        # Copy so that mutating the clone's defaults never affects the original.
        result.__kwdefaults__ = dict(kwdefaults)
    return result


try:
    import regex

    @pklregister(type(regex.Regex("", 0)))
    def _save_regex(pickler, obj):
        """Serialize a compiled ``regex`` pattern.

        The object is reduced to ``regex.compile(obj.pattern, obj.flags)``;
        a log line is emitted before and after the reduction.
        """
        dill._dill.log.info("Re: %s" % obj)
        reduce_args = (obj.pattern, obj.flags)
        pickler.save_reduce(regex.compile, reduce_args, obj=obj)
        dill._dill.log.info("# Re")


except ImportError:
    # No ``regex`` module available: the reducer is simply not registered.
    pass
Example #7
0
    def get_regex_from_pattern(self, patterns, patternid):
        ret = None
        pattern = self.get_pattern_from_id(patternid)

        if pattern:
            ret = self.regexs.get(patternid, None)
            if ret is None:
                if 'error' in pattern:
                    del pattern['error']
                ret = pattern['pattern']
                if ret:
                    # eg.  iii hidas et ii carrucas
                    # iiii hidis et i uirgata
                    # u hidis
                    # pro dimidia hida Hanc
                    # Ibi habet abbas ii hidas et dimidiam in dominio et ii carrucas et uillani dimidiam hidam
                    # hides:different units hid*: hida, uirgat*, ferdi*/ferlin*
                    # ? 47b1: et ui agris
                    # 41a2: iiii hidis et uirga et dimidia
                    #
                    # c bordarios x minus
                    # iiii libras et iii solidos i denarium minus
                    #

                    measurement = self.get_pattern_from_key(
                        'helper_measurement')

                    def replace_reference(match):
                        """Expand one ``<name>`` reference found by ``re.sub``.

                        ``<PATTERN>`` is returned untouched.  Otherwise the
                        pattern stored under ``helper_<name>`` is substituted.
                        If that is missing, a measurement helper exists and
                        the name ends in ``s``, the name minus its last
                        character is looked up and its pattern is inserted
                        into the measurement pattern's ``<PATTERN>`` slot.
                        An unresolved reference is left as-is and recorded in
                        ``pattern['error']``.
                        """
                        whole = match.group(0)
                        name = match.group(1)
                        if name == 'PATTERN':
                            return whole

                        # TODO: try other than helper_, not now as it might have
                        # accidental match
                        helper = self.get_pattern_from_key('helper_' + name)
                        if helper:
                            return helper['pattern']

                        if measurement and name.endswith('s'):
                            # try singular helper
                            helper = self.get_pattern_from_key(
                                ('helper_%s' % name)[0:-1])
                            if helper:
                                # let's apply measurement to this
                                singular = helper['pattern']
                                return measurement['pattern'].replace(
                                    '<PATTERN>', singular)

                        pattern[
                            'error'] = 'Reference to an unknown pattern: <%s>. Check the spelling.' % name
                        return whole

                    i = 0
                    while 'error' not in pattern:
                        i += 1
                        before = ret
                        ret = re.sub(ur'<([^>]+)>', replace_reference, ret)
                        if i > 100:
                            pattern[
                                'error'] = 'Detected circular references in the pattern. E.g. p1 = <p2>; p2 = <p1>.'
                            break
                        if ret == before:
                            break

                    # LOW LEVEL SYNTACTIC SUGAR
                    if not 'error' in pattern:
                        #  e.g. x (<number>)? y
                        while True:
                            ret2 = ret
                            ret = re.sub(ur'( |^)(\([^)]+\))\?( |$)',
                                         ur'(\1\2)?\3', ret2)
                            if ret == ret2:
                                break
                        # <person> habet <number> mansionem
                        ret = ret.replace(ur'%', ur'\w*')
                        # aliam = another
                        # unam = one
                        # dimidia = half
                        # duabus = two
                        ret = ret.replace(ur'7', ur'et')
                        if ret[0] not in [ur'\b', '^']:
                            ret = ur'\b' + ret
                        if not ret.endswith(ur'\b'):
                            ret = ret + ur'\b'
                        try:
                            ret = re.Regex(ret)
                        except Exception, e:
                            pattern['error'] = unicode(e)
                        finally:
Example #8
0
                    if not 'error' in pattern:
                        #  e.g. x (<number>)? y
                        while True:
                            ret2 = ret
                            ret = re.sub(ur'( |^)(\([^)]+\))\?( |$)',
                                         ur'(\1\2)?\3', ret2)
                            if ret == ret2:
                                break
                        # <person> habet <number> mansionem
                        ret = ret.replace(ur'%', ur'\w*')
                        # aliam = another
                        # unam = one
                        # dimidia = half
                        # duabus = two
                        ret = ret.replace(ur'7', ur'et')
                        if ret[0] not in [ur'\b', '^']:
                            ret = ur'\b' + ret
                        if not ret.endswith(ur'\b'):
                            ret = ret + ur'\b'
                        try:
                            ret = re.Regex(ret)
                        except Exception, e:
                            pattern['error'] = unicode(e)
                        finally:
                            self.regexs[patternid] = ret

                if 'error' in pattern:
                    ret = re.Regex('INVALID PATTERN')

        return ret
Example #9
0
# Tokens the tagger labels as proper nouns but that are never presenter names.
_GG_STOP_WORDS = frozenset(['Globe', 'RT', 'http', 'Golden', 'Globes', 'GoldenGlobes', 'Goldenglobes', 'Goldenglobe', 'gg','golden globes', 'golden globe', 'goldenglobe','goldenglobes','gg2015','gg15','goldenglobe2015','goldenglobe15','goldenglobes2015','goldenglobes15', 'gg2013','gg13','goldenglobe2013','goldenglobe13','goldenglobes2013','goldenglobes13', 'rt', '2013', '2015', 'Best', 'BEST', 'Present', 'Presents', 'Angeles'])


def _merge_presenter_names(ranked_bigrams, max_presenters=3):
    """Turn frequency-ranked name bigrams into at most *max_presenters* names.

    *ranked_bigrams* is a ``Counter.most_common()``-style list of
    ``((first, second), count)`` items.  Neighbouring entries whose words
    chain (``(a, b)`` followed by ``(b, c)``) are merged into the three-word
    name ``"a b c"``; remaining slots are filled with the top two-word names.
    The input list is not modified.
    """
    ranked = list(ranked_bigrams)
    names = []
    remaining = max_presenters
    i = 0
    # Stop once the quota is used up: with no slots left a further chaining
    # pair can neither be consumed nor skipped, which previously caused an
    # endless loop.
    while remaining > 0 and i < len(ranked) - 1:
        first, middle = ranked[i][0]
        next_first, last = ranked[i + 1][0]
        if middle == next_first:
            # eg "Sacha Baron" + "Baron Cohen"; >3 names is not accounted for.
            names.append(first + ' ' + middle + ' ' + last)
            del ranked[i:i + 2]
            remaining -= 1
        else:
            i += 1
    for (first, second), _count in ranked[:remaining]:
        names.append(first + ' ' + second)
    return names


def get_presenters(tweets):
    """Guess the presenters of each award from *tweets*.

    Non-retweet tweets that mention presenting (but not "represent") are
    grouped by the award regexes they match.  Within each group, consecutive
    token pairs that are both tagged ``NNP`` and are not stop words are
    counted, and the most frequent ones become the presenter names (at most
    three per award).

    Args:
        tweets: iterable of dicts carrying the tweet body under ``'text'``.

    Returns:
        dict mapping every award name in ``regex.Regex().award_dict`` to a
        list of presenter names; awards without any matching tweet get the
        placeholder ``["a", "e"]``.
    """
    search_terms = [r'[Pp]resent']
    stop_terms = [r'[Rr]epresent']
    award_dict = regex.Regex().award_dict
    awards = list(award_dict.values())

    # Group the relevant tweet texts by award.
    award_results = {}
    for x in tweets:
        tweet = x['text']
        if 'RT' in tweet or re.search(stop_terms[0], tweet):
            continue
        if not any(re.search(term, tweet) for term in search_terms):
            continue
        for award_regex in awards:
            if re.search(award_regex, tweet):
                award = get_key(award_dict, award_regex)
                award_results.setdefault(award, []).append(tweet)

    final = {}
    for award, matched_tweets in award_results.items():
        proper_bi = []
        for tweet in matched_tweets:
            # tag double words with pos and pull out the two-proper-nouns-in-a-row
            for bigram in nltk.bigrams(nltk.word_tokenize(tweet)):
                (word1, tag1), (word2, tag2) = nltk.pos_tag(bigram)
                if (tag1 == "NNP" and word1 not in _GG_STOP_WORDS
                        and tag2 == "NNP" and word2 not in _GG_STOP_WORDS):
                    proper_bi.append((word1, word2))
        final[award] = _merge_presenter_names(Counter(proper_bi).most_common())

    # Awards nobody tweeted about still need an entry.
    for award in award_dict:
        if award not in final:
            final[award] = ["a", "e"]

    return final
Example #10
0
import re
import sys

import utils
import regex

# Build one merged regex structure from every line of the input file.
# Usage: <script> FILE [PARAMETERS]
print("REGEX")
rgx = regex.Regex()
rgx.maxLength = 36
fileName = sys.argv[1]
if len(sys.argv) == 3:
    rgx.util.parameters = sys.argv[2]

# Close the file as soon as its lines are read.
with open(fileName) as inputFile:
    lineList = [line.rstrip('\n') for line in inputFile]

# Fold the per-line structures into a single map; stays None for an empty
# file instead of failing with NameError at the print below.
mergeMap = None
for i, currentLine in enumerate(lineList):
    inputMap = rgx.regexStructure(currentLine)
    if i == 0:
        mergeMap = inputMap
    else:
        mergeMap = rgx.merge(inputMap, mergeMap)

print(mergeMap)


def matchRegex():
Example #11
0
def get_film_winner(tweets,award_names):
    """Pick a winning title for every film-type award in *award_names*.

    For each award listed in ``reg.film_award`` the non-retweet tweets whose
    text matches the award's regex are scanned for quoted phrases (curly or
    straight double quotes, fewer than 10 words, lower-cased).  If none is
    found, phrases enclosed in dashes (fewer than 5 words, stripped) are used
    instead.  Candidates are ranked by frequency and the first one accepted
    by ``validate_film`` is chosen.

    Args:
        tweets: iterable of dicts carrying the tweet body under ``'text'``.
        award_names: iterable of award names; names that are not in
            ``reg.film_award`` are ignored.

    Returns:
        dict mapping each handled award name to the chosen candidate
        (``""`` when no candidate was found at all).
    """
    reg = regex.Regex()
    results = {}
    for movie in award_names:
        if movie not in reg.film_award:
            continue
        search_term = reg.getRegex(movie)

        # Filter once; both extraction passes work on the same tweets.
        matching = [
            tweet['text'] for tweet in tweets
            if 'RT' not in tweet['text'] and re.search(search_term, tweet['text'])
        ]

        result = []
        for text in matching:
            quoted = re.findall(r'“(.*?)”', text) + re.findall(r'"(.*?)"', text)
            for sentence in quoted:
                if len(sentence.split()) < 10:
                    result.append(sentence.lower())

        if not result:
            # Fallback: titles written between dashes.
            for text in matching:
                for sentence in re.findall(r'-(.*?)-', text):
                    if len(sentence.split()) < 5:
                        result.append(sentence.strip())

        # Walk candidates from most to least frequent and stop at the first
        # valid film title.
        # NOTE(review): when nothing validates, the *least* frequent candidate
        # is kept (last loop value) -- confirm this fallback is intended.
        name = ""
        for candidate, _count in Counter(result).most_common():
            name = candidate
            if validate_film(candidate):
                break
        results[movie] = name
    return results
    # for key in results.keys():
    #     print(key,results[key])
    # Counter(result).most_common()
# tweets = json.load(open("gg2013.json"))
# winner1 = get_film_winner(tweets)
# winner2 = get_people_winner(tweets)
# winner1.update(winner2)
# print(len(winner1))
# if __name__ == '__main__':
#    mytable = (
#        ('Joe', 'Clark', '1989'),
#        ('Charlie', 'Babbitt', '1988'),
#        ('Frank', 'Abagnale', '2002'),
#        ('Bill', 'Clark', '2009'),
#        ('Alan', 'Clark', '1804'),
#        )
#    for row in sort_table(mytable, (1,0)):
#        print(row)

"""                           END of SORTER
-----------------------------------------------------------------------------
"""

# Compile the user-supplied pattern and ``regexrev`` with VERBOSE and
# MULTILINE enabled.
# NOTE(review): ``_f`` / ``_r`` presumably stand for forward / reverse strand;
# ``regexrev`` is defined outside this view -- confirm.
psq_re_f = regex.Regex(args.regex,regex.VERBOSE|regex.MULTILINE)
psq_re_r = regex.Regex(regexrev,regex.VERBOSE|regex.MULTILINE)

# NOTE(review): opened without a context manager; no close() is visible in
# this part of the file -- verify the handle is released later.
ref_seq_fh = open(args.fasta)

# Lines collected for the current record.
ref_seq = []
# The first line is taken as a header: a leading '>' is stripped and the rest
# is kept as the record name.
line = (ref_seq_fh.readline()).strip()
# NOTE(review): ``chr`` shadows the builtin of the same name.
chr = re.sub('^>', '', line)
# Read ahead to the first line after the header.
line = (ref_seq_fh.readline()).strip()
gquad_list = []
while True:
    while line.startswith('>') is False:
        ref_seq.append(line)
        line = (ref_seq_fh.readline()).strip()
        if line == '':
            break