def serial_killer_guess(self):
        """
        Implements the Aiden-Michel serial-killer algorithm as described at

        http://dx.doi.org/10.1126/science.1199644

        http://science.sciencemag.org.ezproxy.neu.edu/content/331/6014/176.figures-only.

        I don't think this is likely to be that useful for most users;
        it's here to test the algorithm.
        """

        titles = set(re.findall(r"\w+",self.title().lower()))
        try:
            author = set(re.findall("\w+",self.first_author()["first_author_name"].lower()))
        except KeyError:
            author = set([])
            
        title_blacklist = set(["advances", "almanac", "annual", "bibliography", "biennial", "bulletin", "catalog", "catalogue", "census", "conference", "conferences", "congress", "congressional", "digest", "digest", "directory", "hearings", "index", "journal", "magazine", "meeting", "meetings", "monthly", "papers", "periodical", "proceedings", "progress", "quarterly", "report", "reports", "review", "revista", "serial", "society", "subcommittee", "symposium", "transactions", "volume", "yearbook", "yearly"])
        
        author_blacklist = set(["the", "of", "and", "administration", "congress", "international", "national", "federal", "state", "american", "british", "consortium", "university", "office", "america", "united", "states", "britain", "ireland", "canada", "australia", "institute", "research", "committee", "subcommittee", "court", "association", "foundation", "board", "bureau", "house", "senate", "dept", "department", "state", "council", "club", "school", "network", "online", "company", "co", "us", "u.s.", "survey", "agency", "academy", "commission", "press", "publishing", "publishers", "academic", "cambridge", "sciencedirect", "kluwer", "oxford", "interscience", "library", "on", "society", "service", "affairs", "division", "commerce", "public", "foreign", "government", "agriculture", "science", "engineers", "stanford", "medical", "energy", "laboratory", "economic", "geological", "assembly", "alabama", "alaska", "american", "arizona", "arkansas", "california", "colorado", "connecticut", "delaware", "columbia", "district", "florida", "georgia", "guam", "hawaii", "idaho", "illinois", "indiana", "iowa", "kansas", "kentucky", "louisiana", "maine", "maryland", "massachusetts", "michigan", "minnesota", "mississippi", "missouri", "montana", "nebraska", "nevada", "hampshire", "jersey", "mexico", "york", "ohio", "oklahoma", "oregon", "pennsylvania", "north", "south", "tennessee", "texas", "utah", "vermont", "wisconsin", "wyoming"])

        if len(titles.intersection(title_blacklist)) + len(author.intersection(author_blacklist)):
            return "serial"
        return "book"
Example #2
def countRepeats(read, readReverse):
    i = 0
    j = 0
    global reference
    global referenceReverse
    i = len(re.findall(str(read), str(reference), overlapped=True))
    if read != readReverse:
        j = len(re.findall(str(read), str(referenceReverse), overlapped=True))
    return i + j
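The overlapped=True keyword is what makes this a regex-module call rather than plain re.findall; a quick standalone illustration of the difference:

import regex

print(len(regex.findall("AA", "AAAA")))                   # 2, non-overlapping matches only
print(len(regex.findall("AA", "AAAA", overlapped=True)))  # 3, also counts the match starting at index 1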
Example #3
def pep_end(pep,seq):
	if len(findall(pep,seq)) > 1:
		runs = finditer(pep,seq)
		coord = []
		for match in runs:
			coord.append(match.end())
		return coord
	elif len(findall(pep,seq)) == 1:
		return search(pep,seq).end()
	else: return 'Not found'
Example #4
def pep_end(row):
	pep = row['Peptide'].replace('I','J').replace('L','J').replace('J','(I|L)')
	seq = row['Sequence']
	if len(findall(pep,seq)) > 1:
		runs = finditer(pep,seq)
		coord = []
		for match in runs:
			coord.append(match.end())
		return coord
	elif len(findall(pep,seq)) == 1:
		return [search(pep,seq).end()]
	else: return 'Not found'
Example #5
        def repl(match):
            # e.g. match.group(0) = {% class super %}
            unknown = True
            directive = match.group(1)
            ret = match.group(0)
            parts = [p.strip() for p in directive.split(' ') if p.strip()]
            if parts:
                if parts[0] == 'style':
                    unknown = False

                    classes_intersection = None
                    for term in parts[1:]:
                        classes = []
                        if term == 'STRUCKTWICE':
                            # <span class="T33">aiulfus</span>
                            classes = regex.findall(ur'<text:span text:style-name="([^"]+)">(?:aiulfus|7 dim)</text:span>', xml_string)
                            classes = list(set(classes))
                        elif term == 'PRO':
                            classes = regex.findall(ur'\s<text:span text:style-name="([^"]+)">p</text:span>\[ro\]\s', xml_string)
                            classes = list(set(classes))
                        else:
                            # find the class with the term <term> (e.g. super)
                            # <style:style style:name="T3" style:family="text">
                            #     <style:text-properties
                            #         style:text-position="super 58%" />
                            # </style:style>        
                            for style in regex.findall(ur'(?musi)<style:style style:name="(.*?)"[^>]*>(.*?)</style:style>', xml_string):
                                if term in style[1]:
                                    classes.append(style[0])
                    
                        if not classes:
                            raise Exception('ERROR: style not found "%s"' % term)
                        
                        if classes_intersection is None:
                            classes_intersection = classes
                        else:
                            # only keep the classes/styles that meet all keywords (AND) 
                            classes_intersection = set(classes).intersection(set(classes_intersection))
                            
                    # now remove classes which we have already used
                    already_used_warning = set(classes_intersection).intersection(set(classes_used.keys()))
                    if already_used_warning:
                        print '<!-- Already used classes/styles: %s (see above) -->' % ', '.join(list(already_used_warning))
                    classes_intersection = set(classes_intersection).difference(set(classes_used.keys())) 
                    # update the classes_used
                    for cls in classes_intersection:
                        classes_used[cls] = 1
                    
                    ret = ' or '.join([ur"@text:style-name='%s'" % cls for cls in classes_intersection])

                    print '<!-- %s => %s -->' % (parts, ret)
Example #6
	def getCoverageProblems(self):
		"""Verify that each rule and each exclusion has the right number of tests
			 that apply to it. TODO: Also check that each target has the right
			 number of tests. In particular left-wildcard targets should have at least
			 three tests. Right-wildcard targets should have at least ten tests.

			 Returns an array of strings reporting any coverage problems if they exist,
			 or an empty list if coverage is sufficient.
			 """
		problems = self._determineTestApplication()
		# Next, make sure each rule or exclusion has sufficient tests.
		for rule in self.rules:
			needed_count = 1 + len(regex.findall("[+*?|]", rule.fromPattern))
			# Don't treat the question mark in non-capturing and lookahead groups as increasing the
			# number of required tests.
			needed_count = needed_count - len(regex.findall("\(\?:", rule.fromPattern))
			needed_count = needed_count - len(regex.findall("\(\?!", rule.fromPattern))
			needed_count = needed_count - len(regex.findall("\(\?=", rule.fromPattern))
			# Don't treat escaped questions marks as increasing the number of required
			# tests.
			needed_count = needed_count - len(regex.findall("\\?", rule.fromPattern))
			actual_count = len(rule.tests)
			if actual_count < needed_count:
				problems.append("%s: Not enough tests (%d vs %s) for %s" % (
								self.filename, actual_count, needed_count, rule))
				pass
		for exclusion in self.exclusions:
			needed_count = 1 + len(regex.findall("[+*?|]", exclusion.exclusionPattern))
			needed_count = needed_count - len(regex.findall("\(\?:", exclusion.exclusionPattern))
			needed_count = needed_count - len(regex.findall("\\?", rule.fromPattern))
			actual_count = len(exclusion.tests)
			if actual_count < needed_count:
				problems.append("%s: Not enough tests (%d vs %s) for %s" % (
								self.filename, actual_count, needed_count, exclusion))
		return problems
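To make the counting concrete, here is what the first computation yields for an invented fromPattern (not taken from any real ruleset):

import regex

fromPattern = r"^http://(foo|bar)\.example\.com/"
needed_count = 1 + len(regex.findall("[+*?|]", fromPattern))
print(needed_count)  # 2: the single '|' alternation means at least two tests are expected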
Example #7
def get_output_extension(file):
	pattern = str(file).lower()

	full_ext = re.findall(r'(\..+)+$',pattern)

	if full_ext:
		full_ext.reverse()
		full_ext = full_ext[0]

		exts = re.findall(r'\.[^.]*',full_ext)
		exts.reverse()
		return exts[1] if len(exts) > 1 else exts[0]

	return ''
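If the logic above is traced through, compound suffixes return their inner extension; an illustrative check with invented file names:

print(get_output_extension("archive.tar.gz"))  # '.tar'
print(get_output_extension("notes.txt"))       # '.txt'
print(get_output_extension("README"))          # ''  (no dot, so no extension is found)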
Example #8
def pep_start(row):
	# correct for leucine/isoleucine indistinguishability
	pep = row['Peptide'].replace('I','J').replace('L','J').replace('J','(I|L)')
	seq = row['Sequence']
	#if it matches more than once, return a list of positions
	if len(findall(pep,seq)) > 1:
		runs = finditer(pep,seq)
		coord = []
		for match in runs:
			coord.append(match.start()+1)
		return coord
	elif len(findall(pep,seq)) == 1:
		return [search(pep,seq).start()]
	else: return 'Not found'
Example #9
def getWordMatrix(word,model,padToMaxLength = None):
    phonemes_alone="pbmfv84tdszcnSZCjT5kgxNqGX7hlLwyr!ieaouE3"
    phonemeSearchRegex = "["+phonemes_alone+"][\"\*]?(?!["+phonemes_alone+"]~|["+phonemes_alone+"]{2}\$)|["+phonemes_alone+"]{2}?~|["+phonemes_alone+"]{3}?\$"
    phonemes = regex.findall(phonemeSearchRegex, word)
    wordVector = []
    for phoneme in phonemes:
        #if phoneme not in model, get single chars as phonemes instead
        if phoneme not in model:
            for ph in regex.findall("["+phonemes_alone+"]", phoneme):
                wordVector.append(model[ph])
        else:       
            wordVector.append(model[phoneme])    
    if padToMaxLength:
        return np.pad(np.array(wordVector),((0,padToMaxLength - len(wordVector)),(0,0)),mode="constant")
    return wordVector
Example #10
    def most_common_letter(self, text_wrapper):
        """
        The method gets the most common letter(s) present in
        a text. In the case that more than one letter has the
        same number of occurrences, they are all reported back.
        args:
            text_wrapper: The text to parse
        """
        total_occurences = Counter()
        for line in text_wrapper:
            # \p{...} matches a single code point by Unicode property;
            # \p{L} matches any single letter
            letters = regex.findall(r'[\p{L}]', line.lower())
            total_occurences.update(letters)

        if not total_occurences:
            self.log.debug("No matches for letters or digit")
            return

        letter_occurences = total_occurences.most_common()
        # we create a list of the letters that have the same number of
        # occurrences as the most frequent letter found
        most_common = list(filter(lambda x: x[1] == letter_occurences[0][1],
                                  letter_occurences))
        result = ""
        # concatenate the results
        for letter in most_common:
            result = result + str(letter[0]) + " "

        return result.rstrip()
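The core of the method is just \p{L} plus collections.Counter; a stripped-down, self-contained sketch of that idea with an invented sample text:

import io
from collections import Counter

import regex

totals = Counter()
for line in io.StringIO("École à Paris\nZürich zoo\n"):
    # \p{L} counts accented characters as letters too
    totals.update(regex.findall(r'\p{L}', line.lower()))
print(totals.most_common(1))  # [('o', 3)]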
Example #11
def times(pattern, key, row):
    found = re.findall(pattern, row[key])
    if found:
        times_apper = len(found)
    else:
        times_apper = 0  # len(row[u'Rambam'])
    return times_apper
Example #12
def print_result(file):

    # define regex representation of the N-glycosylation motif
    # [XY] means "either X or Y" and {X} means "any amino acid except X."
    Nmotif = r'N[^P][ST][^P]'

    # read file with list of UniProt Protein Database access IDs
    with open(file, 'r') as file:
        ids = file.readlines()

    # scan thru IDs, check each for N-glycosylation motif
    for id in ids:

        # read text data from url to a single line
        url = r'http://www.uniprot.org/uniprot/{}.fasta'.format(id.rstrip())
        data = urlopen(url).readlines()[1:]
        data = ''.join([x.rstrip().decode('utf-8') for x in data])

        # find the N-glycosylation motif in data, get all indices of the match
        match = re.findall(Nmotif, data, overlapped=True)
        match = [str(data.index(x)+1) for x in match]

        # print id and indices of the match
        if match:
            print(id.rstrip())
            print(' '.join(match))
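overlapped=True only exists in the third-party regex module, so the import here is presumably "import regex as re". Overlapping matters because two motif hits can share residues; an illustrative check with a made-up peptide:

import regex as re

Nmotif = r'N[^P][ST][^P]'
peptide = 'ANNTSTK'
print(re.findall(Nmotif, peptide))                   # ['NNTS']
print(re.findall(Nmotif, peptide, overlapped=True))  # ['NNTS', 'NTST']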
Example #13
 def getString(self,word):
      #turn every vowel into generic "V"
     v = "[aeiouE3]"
     word = regex.sub(v,"V",word)
     phonemes_alone="pbmfv84tdszcnSZCjT5kgxNqGX7hlLwyr!V"
     phonemeSearchRegex = "["+phonemes_alone+"][\"\*]?(?!["+phonemes_alone+"]~|["+phonemes_alone+"]{2}\$)|["+phonemes_alone+"]{2}?~|["+phonemes_alone+"]{3}?\$"
     phonemes = regex.findall(phonemeSearchRegex, word)
     word_new = ""
     for phoneme in phonemes:
         #if phoneme not in model, get single chars as phonemes instead
         if phoneme not in self.phoneme_feature_dict:
             for ph in regex.findall("["+phonemes_alone+"]", phoneme):
                 word_new += ph
         else:       
             word_new += phoneme   
     return word_new
Example #14
 def findall_p_in_s(p,s):
     """"returns a series of matches for a pattern (p) in a str (s)"""""
     match_strs = regex.findall(p,s)
     #get pairs of left and right indexes
     match_indexes = [(i.start(0),i.end(0)) for i in regex.finditer(p,s)]
     all_p_in_s = [Match(match_strs[i],match_indexes[i][0],match_indexes[i][1]) for i in range(0,len(match_strs))]
     return all_p_in_s
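Match is presumably a small record type defined elsewhere in that project; a self-contained variant under that assumption, using a namedtuple and a single finditer pass:

from collections import namedtuple

import regex

# assumed shape of the record: matched text plus left/right character offsets
Match = namedtuple('Match', ['text', 'start', 'end'])

def findall_p_in_s(p, s):
    """Return a Match for every occurrence of pattern p in string s."""
    return [Match(m.group(0), m.start(0), m.end(0)) for m in regex.finditer(p, s)]

print(findall_p_in_s(r'\d+', 'a1bb22ccc333'))
# [Match(text='1', start=1, end=2), Match(text='22', start=4, end=6), Match(text='333', start=9, end=12)]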
Example #15
def tokenize2(text):
    """uses the letters to break the text into words
    returns a list of words"""
    # words = re.findall('[a-zåàâäæçéèêëîïôöœßùûüÿA-ZÅÀÂÄÆÇÉÈÊËÎÏÔÖŒÙÛÜŸ’\-]+', text)
    # words = re.findall('\w+', text)
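    # note: \p{L} is not supported by the stdlib re module, so this presumably relies on "import regex as re"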
    words = re.findall('\p{L}+', text)
    return words
Example #16
def update_post(username, slug):
    user = current_user
    content = request.form.get('content', type=str)
    cursor = request.form.get('cursor', type=int)

    if content is not None:
        post = user.posts.filter_by(slug=slug).first()
        if post:
            post.cursor = len(content) if not cursor else cursor
            post.modified_timestamp = datetime.utcnow()
            
            # Get meta
            r = regex.compile(r'<<((?:(?>[^<>]+)|<(?!<)|>(?!>))*?)>>', regex.I | regex.S)
            post.meta = json.dumps(regex.findall(r, content))
            
            # Encrypt
            half_key = session[generate_hash(user.user_key_salt)]
            key = xor_keys(half_key, app.config['MASTER_KEY'])
            content = snappy.compress(content)
            content = AES_encrypt(key, user.username, content)
            post.content = content
            
            db.session.add(post)
            db.session.commit()
            return jsonify(error=None)
        return jsonify(error="Not found")
    elif cursor is not None:
        post = user.posts.filter_by(slug=slug).first()
        if post:
            post.cursor = cursor
            db.session.add(post)
            db.session.commit()
            return jsonify(error=None)
        return jsonify(error="Not found")
    return jsonify(error="Invalid parameters")
Example #17
def get_special_parameter(name,data):
    # Something like:
    # name=FID for "FID=SEL-351S-6-R107-V0-Z003003-D20011129","0958"
    # name=PARTNO for "PARTNO=0351S61H3351321","05AE"
    # name=DEVID for "DEVID=TMU 2782","0402"
    return regex.findall(r'^\"' + name + r'=([\w :+/\\()!,.\-_\\*\"]*\n)', 
        data, flags=regex.MULTILINE, overlapped=True)
Example #18
def get_stats_from_xml_string(xml_string, text_label="", stats=None):
    #     print 'Count - Tag'
    #     print

    els = {}
    if stats is not None:
        els = stats

    import regex as re

    #     elements = re.findall(ur'<(\w+)', xml_string)
    #     for el in set(elements):
    #         print '%8d %s' % (elements.count(el), el)

    #     print
    #     print 'Unique tag-attributes'
    #     print
    elements = re.findall(ur"<([^/!?>][^>]*)>", xml_string)
    for el in elements:
        el = el.strip()
        if el not in els:
            els[el] = {"text": text_label, "count": 1}
        else:
            els[el]["count"] += 1

    return els
Example #19
def get_all_pali_words():
    if "pali_words" not in cache:
        words = Counter()
            
        for file in (sc.text_dir / 'pi' / 'su' / 'mn').glob('**/*.html'):
            doc = html.parse(str(file))
            root = doc.getroot()
            for e in root.cssselect('#metaarea'):
                e.drop_tree()
            text = root.text_content()
            text = regex.sub(r'[\xad”’]', '', text)
            words_from_text = regex.findall(r'\p{alpha}+', text)
            words.update(words_from_text)
            words.update(word.rstrip('ṃ') for word in words_from_text if word.endswith('ṃ'))
        
        result = {}
        for word, count in words.most_common():
            asc_word = asciify(word)
            if not asc_word in result:
                result[asc_word] = ((word, count),)
            else:
                result[asc_word] = result[asc_word] + ((word, count),)
            
        cache["pali_words"] = result
    
    return cache["pali_words"]
Example #20
def main():
    gitGrep = subprocess.Popen(["git", "grep", "(\s*http.*)"],
                               stdout=subprocess.PIPE)
    for line in gitGrep.stdout:
        urls = regex.findall(URL_REGEX, line)
        for url in urls:
            gQueue.put(url[1])

    for i in range(NUM_WORKER_THREADS):
        t = threading.Thread(target=worker)
        t.daemon = True
        t.start()

    # block until we've finished all our jobs
    gQueue.join()

    # finally output each file and the corresponding URLs to fix/remove
    for key in gProcessed:
        print("%s has the following changes needed:" % (
              colored(key, 'yellow')))

        for url in gProcessed[key]:
            print("- %s" % url)

    # an empty dict evaluates as False, so we can take advantage of that to
    # return 0 when we have no matches, and 1 when there are matches
    sys.exit(gEncounteredErrors)
Example #21
def get_random_dna(length, max_repeat_nuc=float('inf'), invalid_patterns=None):
    '''Returns a random sequence of DNA of the supplied length,
    while adhering to a maximum number of repeating nucleotides.'''
    max_attempts = 1000
    attempts = 0

    if invalid_patterns is None:
        invalid_patterns = []

    while True:
        attempts += 1

        if attempts > max_attempts:
            raise ValueError('Unable to optimise sequence. ' +
                             'Greater than ' + str(max_repeat_nuc) +
                             ' repeating nucleotides.')

        random_dna = _get_random_dna(length)

        valid = True
        for invalid_pattern in invalid_patterns:

            if len(re.findall(invalid_pattern, random_dna, overlapped=True)) \
                    > 0:
                valid = False

        if valid and is_valid(random_dna, max_repeat_nuc):
            return random_dna

    return None
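_get_random_dna and is_valid are referenced but not shown; the stand-ins below are assumptions made only so the example can run, not the project's actual helpers:

import random

import regex as re

def _get_random_dna(length):
    # assumed helper: uniform random sequence over the four nucleotides
    return ''.join(random.choice('ACGT') for _ in range(length))

def is_valid(dna, max_repeat_nuc):
    # assumed helper: reject runs of a single nucleotide longer than max_repeat_nuc
    return all(len(run.group(0)) <= max_repeat_nuc
               for run in re.finditer(r'(.)\1*', dna))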
Example #22
def extract_features(name):
    def find(reg, str):
        res = regex.findall(reg, str, regex.I)
        if res:
            return '|'.join(sorted(res))
        else:
            return None

    return {
        'length': len(name),
        'tokens': len(regex.findall('[\w\']+', name)),
        'resolution': find('(720|1080)', name),
        'quality': find('(SDTV|HDTV|PDTV|WEB-?DL|WEBRIP|XVID|DIVX|DVDR|DVD-RIP|x264|dvd|XvidHD|AVC|AAC|VC\-?1|wmvhd|web\-dl|BRRIP|HDRIP|HDDVD|bddvd|BDRIP|webscr|bluray|bd?25|bd?50|blu-ray|BDREMUX)', name),
        '3d': bool(find('(3D)', name)),
        'subgroup': find('\[(\w+)\]', name),
        'filehash': bool(find('\[([0-9a-fA-F]{8})\]', name)),
        'season': bool(find('(S\d{1,2})', name)),
        'episode': bool(find('(E\d{1,2})', name)),
        'airdate': bool(find('((?:\d{4}[.-/ ]\d{2}[.-/ ]\d{2})|(?:\d{2}[.-/ ]\d{2}[.-/ ]\d{4}))', name)),
        'year': bool(find('[.-/ ](\d{4})[.-/ ]', name)),
        'versus': bool(find('[.-/ ](vs?)[.-/ ]', name)),
        'music': bool(find('((?:^VA(?:\-|\_|\ ))|(?:MP3|VBR|NMR|CDM|FLAC|\-(?:CDR?|EP|LP|SAT|2CD|FM|VINYL|DE|CABLE|TAPE)\-))', name)),
        'ebook': bool(find('(e?\-?book|html|epub|pdf|mobi|azw|doc|isbn)', name)),
        'comic': bool(find('(cbr|cbz)', name)),
        'magazine': bool(find('(mag(?:s|azine?s?))', name)),
        'sport': find('(epl|motogp|bellator|supercup|wtcc|bundesliga|uefa|espn|wwe|wwf|wcw|mma|ucf|fia|pga|nfl|ncaa|fifa|mlb|nrl|nhl|afl|nba|wimbledon|cricket)[\. -_]', name),
        'xxx': bool(find('(xxx|imageset|p**n|erotica)', name)),
        'game': find('(PS3|3DS|NDS|PS4|XBOX|XBONE|WII|DLC|CONSOLE|PSP|X360|PS4)', name),
        'foreign': bool(find('(seizoen|staffel|danish|flemish|dutch|Deutsch|nl\.?subbed|nl\.?sub|\.NL|\.ITA|norwegian|swedish|swesub|french|german|spanish|icelandic|finnish|Chinese\.Subbed|vostfr|Hebrew\.Dubbed|\.HEB\.|Nordic|Hebdub|NLSubs|NL\-Subs|NLSub|Deutsch| der |German | NL |\.PL\.)', name)),
        'pc': bool(find('((?:v?\d\.\d\.)|(?:x64|32bit|64bit|exe))', name)),
        'documentary': bool(find('(documentary|national geographic|natgeo)', name))
    }
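A quick illustration of the feature dictionary this produces; the release name is invented for the example:

features = extract_features('Some.Show.S01E02.720p.HDTV.x264-GROUP')
print(features['resolution'])                   # '720'
print(features['quality'])                      # 'HDTV|x264'
print(features['season'], features['episode'])  # True True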
Example #23
 def idhit(self,
           id,
           seq,
           primerDict
           ):
     for primer in primerDict:
         #match=regex.findall("(%s){e<=5}" % (primer), str(seq))
         ## checks if more than one match! 
         #if len(match)==1:
         #ind = str(seq).index(match[0])
         
         a={id: regex.findall("(%s){e<=5}" % (primer), str(seq))}
         a={key:item for key, item in a.iteritems() if item}
             
         for key,item in a.iteritems():
             if len(item)>1:    
                 best_match=[difflib.SequenceMatcher(None, x, primer).ratio() for x in item]
                 best=item[ best_match.index(max(best_match)) ] 
                 ind = str(seq).index(best) + len(best)
             else:
                 best=item[0]
                 ind = str(seq).index(best) + len(best)
         
         if len(str(seq)[ind+20:ind+270])==250:
             self.f250[id] = str(seq)[ind+20:ind+270]
             self.f200[id] = str(seq)[ind+20:ind+220]
             self.f210[id] = str(seq)[ind+20:ind+230]
             self.f220[id] = str(seq)[ind+20:ind+240]
             self.f230[id] = str(seq)[ind+20:ind+250]
             self.f240[id] = str(seq)[ind+20:ind+260]
             return True
         
         else:
             continue
     return False
Example #24
def call_C_primer(C_seq, C_primer_match_dict):

    primer_isotype = 'no_primer'
    primer_position = -1
    C_seq_without_primer = C_seq
    C_seq_length = len(C_seq)
    # series of ifs necessary for correct execution (as opposed to if / else)
    for primer_seq, primer_name in C_primer_match_dict.items():
        primer_position = C_seq[-25:].find(primer_seq)  # look only in the last 25nt for the primer seq
        if primer_position == -1:  # try to find a shortened version of the primer (for 12nt barcodes as opposed to 8)
            primer_position = C_seq[-25:].find(primer_seq[:-4])
        if primer_position == -1:  # still no exact primer match, moving to fuzzy primer search
            match = regex.findall('(%s){e<=2}'%primer_seq[:-4],C_seq[-25:],regex.BESTMATCH)  # fuzzy match allowing 2 errors
            if match != []:
                primer_position = C_seq[-25:].find(match[0])  # regex.findall returns a list
        if primer_position != -1:
            primer_isotype = primer_name
            # primer search is done on only the last 25nt of the constant
            # sequence, thus the match location must be reindexed relative to
            # the entire constant sequence
            adj_primer_pos = primer_position + C_seq_length - 25
            C_seq_without_primer = C_seq[:adj_primer_pos]
            break

    return primer_isotype, C_seq_without_primer
Example #25
	def parse_readme(self, rules):
		filepath = os.path.join(self.scapegoat_dir, 'README.md')
		if not os.path.isfile(filepath):
			raise Exception('Could not open "%s"' % filepath)
		f = open(filepath, 'r')
		content = f.read()
		f.close()
		
		m = regex.search(r'\n### Inspections(.*)\n### ', content, regex.DOTALL)
		if m is None:
			return
		blocks = m.group(1).split('#####')
		for block in blocks:
			block = block.strip()
			if '|Name|' in block:
				mx = regex.findall(r'\n\|([^\|]+)\|([^\|]*)\|', block)
				for m in mx:
					key = m[0].strip()
					wikitext = m[1].strip()
					if key and key[0] != '-' and key != 'Name':
						if key not in rules:
							rules[key] = {}
						rules[key]['wikitext'] = wikitext
			else:
				(key, desc) = block.split('\n', 1)
				key = ''.join(word[0].upper() + word[1:] for word in key.split())
				desc = desc.strip()
				if key not in rules:
					rules[key] = {}
				rules[key]['description'] = desc
		if not len(rules) > 0:
			raise Exception('invalid scapegoat readme')
Example #26
def f_pr02(r):
	slog=regex.findall(v+k+v,r,overlapped=True)
	tmp=d_raz[r][0]
	tpp=d_raz[r][1]
	in_granice=0
	if len(slog)!=0:
		for j in slog:
			if slog.count(j)>1:
				brk=0
				for w in r:
					if (r.find(j,brk)+1) not in tmp and ((r.find(j,brk)+1)!=0):
						tmp.append(r.find(j,brk)+1)
						tpp.append(str(r.find(j,brk)+1)+'-pr02a')
					brk+=1
					
			else:		
			
				in_granice=[match.start() for match in re.finditer(re.escape(j),r)]		
				for y in in_granice:
					if y+1 not in tmp and str(y+2)+'-pr_naj' not in tpp:
						tmp.append(y+1)
						tpp.append(str(y+1)+'-pr02b')
		
	
	tmp.sort()
	tpp.sort()
Example #27
def find_occurrences(needle, haystack):
    """find_occurrences."""
    for h, s in haystack:
        matches = re.findall(needle, s, overlapped=True)
        if len(matches):
            yield 1
        else:
            yield 0
Example #28
 def get_containing_shortcodes_set(self, data):
     result = set()
     matches = regex.findall(self._detect_any_shortcode_regex, data)
     for match in matches:
         if match[0] == '[' and match[5] == ']':
             continue
         result.add(match[1])
     return result
Example #29
def find_ORF(dna_sequence):
    # Create the sequence
    seq = Seq.Seq(dna_sequence)
    # Translate all 6 reading frames
    trans = [pad_seq(seq).translate(),
             pad_seq(seq[1:]).translate(),
             pad_seq(seq[2:]).translate(),
             pad_seq(seq.reverse_complement()).translate(),
             pad_seq(seq.reverse_complement()[1:]).translate(),
             pad_seq(seq.reverse_complement()[2:]).translate()]
    orfs = list()
    for t in trans:
        m = regex.findall('M\w*\*', str(t))
        if m != []:
            m = max(regex.findall('M\w*\*', str(t)), key=lambda a: len(a))
            orfs.append(m)
    return(max(orfs, key=lambda a: len(a)))
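pad_seq is referenced but not shown; a plausible stand-in (an assumption, not the project's actual helper) pads to a codon boundary so Biopython's translate() does not hit a partial codon:

from Bio import Seq

def pad_seq(seq):
    # pad with 'N' so the length is a multiple of three (assumed behaviour)
    remainder = len(seq) % 3
    return seq if remainder == 0 else seq + Seq.Seq('N' * (3 - remainder))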
Example #30
	def get_characters(self):
		# output every distinct single character
		characters = []
		reg = ur'[\u4e00-\u9fa5]'
		reg = list(set(re.findall(reg, self.content, overlapped=True)))
		for r in reg:
			characters.append(r)
		return characters		
Example #31
    def reorder_endnotes(self,
                         target_endnote_number: int,
                         step: int = 1) -> None:
        """
		Reorder endnotes starting at target_endnote_number.

		INPUTS:
		target_endnote_number: The endnote to start reordering at
		step: 1 to increment or -1 to decrement

		OUTPUTS:
		None.
		"""

        increment = step == 1
        endnote_count = 0
        source_directory = self.path / "src"

        try:
            endnotes_filename = source_directory / "epub/text/endnotes.xhtml"
            with open(endnotes_filename, "r+", encoding="utf-8") as file:
                xhtml = file.read()
                soup = BeautifulSoup(xhtml, "lxml")

                endnote_count = len(soup.select("li[id^=note-]"))

                if increment:
                    note_range = range(endnote_count,
                                       target_endnote_number - 1, -1)
                else:
                    note_range = range(target_endnote_number,
                                       endnote_count + 1, 1)

                for endnote_number in note_range:
                    xhtml = xhtml.replace(
                        f"id=\"note-{endnote_number}\"",
                        f"id=\"note-{endnote_number + step}\"", 1)
                    xhtml = xhtml.replace(
                        f"#noteref-{endnote_number}\"",
                        f"#noteref-{endnote_number + step}\"", 1)

                # There may be some links within the notes that refer to other endnotes.
                # These potentially need incrementing / decrementing too. This code assumes
                # a link that looks something like <a href="#note-1">note 1</a>.
                endnote_links = regex.findall(
                    r"href=\"#note-(\d+)\"(.*?) (\d+)</a>", xhtml)
                for link in endnote_links:
                    link_number = int(link[0])
                    if (link_number < target_endnote_number and
                            increment) or (link_number > target_endnote_number
                                           and not increment):
                        continue
                    xhtml = xhtml.replace(
                        f"href=\"#note-{link[0]}\"{link[1]} {link[0]}</a>",
                        "href=\"#note-{0}\"{1} {0}</a>".format(
                            link_number + step, link[1]))

                file.seek(0)
                file.write(xhtml)
                file.truncate()

        except Exception:
            raise se.InvalidSeEbookException(
                f"Couldn’t open endnotes file: [path][link=file://{endnotes_filename}]{endnotes_filename}[/][/]."
            )

        with concurrent.futures.ProcessPoolExecutor() as executor:
            for root, _, filenames in os.walk(source_directory):
                for filename in fnmatch.filter(filenames, "*.xhtml"):
                    # Skip endnotes.xhtml since we already processed it
                    if filename == "endnotes.xhtml":
                        continue

                    executor.submit(_process_endnotes_in_file, filename,
                                    Path(root), note_range, step)
Example #32
identity_pattern = "(?P<Mrid>M[0-9]{5}-[0-9]{5})|(?P<Prid>P[0-9]{12})"
address_pattern = "(?!.*\/).+[A-Z]{2}[ _,]{1,2}[0-9]{5}"
location_pattern = "((?P<neighborhood>[0-9 \%a-zA-Z-]+)_)?(?P<city>[a-zA-Z-]+)_(?P<state>[A-Z]{2})"
identity_contents = lambda x: {
    key: value
    for key, value in re.search(identity_pattern, x).groupdict().items()
    if value
}
location_contents = lambda x: re.search(location_pattern, x).groupdict(
) if re.search(location_pattern, x).groupdict() is not None else {}
additive_reduction = lambda x, y: x + y
lifo_reduction = lambda x, y: x
identity_parser = lambda x: identity_contents(x)[
    "Mrid"] if "Mrid" in identity_contents(x).keys() else identity_contents(x)[
        "Prid"]
data_parser = lambda x: str(re.findall(address_pattern, x)[0]).replace(
    "_", ", ").replace("-", " ")
address_parser = lambda x: str(Address.fromsearch(x))
flood_parser = lambda x: str(x).replace("/", "|").strip() if not any(
    [str(x) == "N|A", str(x) == "N/A"]) else ""
noise_parser = lambda x: str(x) if not any([str(x) == "N|A",
                                            str(x) == "N/A"]) else ""
location_parser = lambda x: {
    key: str(value).replace("%20", " ")
    for key, value in location_contents(x).items()
}
space_parser = lambda x: str(Space.fromsearch("\n".join(list(x))))
price_parser = lambda x: str(Price.fromsearch(x))
date_parser = lambda x: Datetime.today().date() if str(x).lower(
) == "today" else Datetime.strptime(x, "%m/%d/%Y").date()
link_parser = lambda x: "".join(["https://www.realtor.com", x]) if not str(
Example #33
 def encode(self, text):
     bpe_tokens = []
     for token in re.findall(self.pat, text):
         token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8'))
         bpe_tokens.extend(self.encoder[bpe_token] for bpe_token in self.bpe(token).split(' '))
     return bpe_tokens
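self.pat is not shown; GPT-2-style byte-pair tokenizers typically use the pre-tokenization pattern below, so treat it as an assumption about this project rather than its actual value:

import regex as re

# contractions, letter runs, digit runs, other-symbol runs, then whitespace
pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")

print(re.findall(pat, "Hello world, it's 2024!"))
# ['Hello', ' world', ',', ' it', "'s", ' 2024', '!']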
Example #34
                                 mode='r',
                                 encoding='utf-8')
    except FileNotFoundError:
        print("\t{0} not found.".format(items_file_name))
        continue

    items = json.load(items_file)
    print("{0} loaded.".format(items_file_name))

    items_file.close()

    for item in items:
        if item["tr_text"] != "" and item["tr_explain"] == "":
            item["tr_explain"] = "Unlocks the "

            if len(regex.findall("女性のみ使用可能。", item["jp_explain"])) > 0:
                item["tr_explain"] += "female-only "
            elif len(regex.findall("男性のみ使用可能。", item["jp_explain"])) > 0:
                item["tr_explain"] += "male-only "

            item["tr_explain"] += (item_type + "\n\"" + item["tr_text"] +
                                   "\"\n" + "for use in the Beauty Salon.")

            print("Translated description for {0}".format(item["tr_text"]))

    items_file = codecs.open(os.path.join(json_loc, items_file_name),
                             mode='w',
                             encoding='utf-8')
    json.dump(items,
              items_file,
              ensure_ascii=False,
Example #35
s_p_adres['adres'] = s_p_adres['adres'].apply(
    lambda x: re.sub('(.+?)( .+)', r'\1', x))
s_p_adres['len'] = s_p_adres['adres'].apply(lambda x: len(x))
s_p_adres['adres'] = s_p_adres['adres'].apply(
    lambda x: x.replace('http://www.', 'http://'))
s_p_adres['id'] = s_p_adres['id'].astype(int)
s_p_adres = s_p_adres.sort_values(['id', 'len']).drop_duplicates()

test = s_p_adres.groupby('id').head(1).reset_index(drop=True).drop(
    columns='len')

serwis_nazwa = serwisy_portale.copy()[['id', 'nazwa']]

podpola_marc = []
for e in serwis_nazwa['nazwa']:
    val = re.findall('(?<=^| ).(?= )', e)
    for el in val:
        if el.isalpha():
            podpola_marc.append(el)

podpola_marc = list(set(podpola_marc))

serwis_nazwa['nazwa'] = serwis_nazwa['nazwa'].str.replace('b |p |c |a ', '')

czasopisma = gsheet_to_df('1EWzb9mCsTVxYDqj_CzKW5EcqJy4z3a_-GyNGAbAjGH0',
                          'Czasopisma (finalny)')
czasopisma_fin = czasopisma.copy()[['id', 'adres']]
czasopisma_fin = cSplit(czasopisma_fin, 'id', 'adres', '❦')
czasopisma_fin = czasopisma_fin[czasopisma_fin['adres'].notnull()]

czasopisma_fin = czasopisma_fin[czasopisma_fin['adres'].str.contains('http')]
Example #36
def feature_extractor(x):
    X_high = list(
        map(lambda x: x.lower().strip().replace("  ", " ").replace("ё", "е"),
            regex.findall("[А-ЯЁ]+[IА-ЯЁ\ ]+", x)))[0]

    pos_types = [
        'ADJF', 'ADJS', 'ADVB', 'COMP', 'CONJ', 'GRND', 'INFN', 'INTJ', 'NOUN',
        'NPRO', 'NUMR', 'None', 'PRCL', 'PRED', 'PREP', 'PRTF', 'PRTS', 'VERB'
    ]

    correct_words = [
        'их', 'пятисот', 'поезжайте', 'полощущий', 'благородства', 'зажжёт',
        'директора', 'его', 'погон', 'повара', 'значительный', 'нужный',
        'высоко', 'оденьтесь', 'высохла', 'инженеры', 'шестистах',
        'пятидесяти', 'девятистам', 'промок', 'ярче', 'дёсны', 'семьюдесятью',
        'тысяча', 'ближайшая', 'носков', 'тихо', 'конструкторы', 'их',
        'четырьмястами', 'семьюстами', 'обеими', 'едкий', 'более громко',
        'легчайшее', 'вафель', 'промок', 'полотенец', 'ложись', 'лягте',
        'графику', 'пятистам', 'ботинок', 'конюхи', 'шестьюстами', 'возрасты',
        'супы', 'их', 'две', 'плеч', 'восьмидесяти', 'заглохший', 'лягте',
        'трое', 'им', 'семьюдесятью', 'пятисот', 'тетерева', 'жёстче',
        'директора', 'сплетен', 'доходчиво', 'поезжай', 'полутора',
        'более глубоко', 'свитеры', 'глубочайшее', 'директора', 'сбережёт',
        'поезжай', 'титулы', 'более просто', 'доктора', 'стрижёт', 'трёхстах',
        'обеих', 'щупалец', 'в двухстах', 'обеих', 'месяцы', 'обеих',
        'пятистах', 'плещет', 'поезжай', 'лёгкая', 'брызг', 'более резкое',
        'сотен', 'надёжный', 'полутора', 'две', 'свитеры', 'более высоко',
        'георгинов', 'обоих', 'надеть', 'абрикосов', 'стрижёт', 'богинь',
        'выговоры', 'наиболее', 'апельсинов', 'испечём', 'шестьюстами',
        'всех быстрее', 'шофёры', 'четверо', 'богаче', 'туфель', 'их', 'басен',
        'поезжай', 'барышень', 'шофёры', 'обеих', 'сардин', 'полотенец', 'две',
        'завидую', 'абрикосов', 'тренеры', 'колокола', 'аэропорту',
        'насмехаться', 'наилучший', 'течёт', 'смелее', 'обоих', 'обеим',
        'тремястами', 'бухгалтеры', 'красивый', 'лягте', 'сапог', 'падежей',
        'мокла', 'клади', 'их', 'солдат', 'туфля', 'выздоровеешь',
        'шестидесяти', 'восьмисот', 'профессора', 'положи', 'кухонь',
        'профессора', 'восьмисот', 'поезжай', 'полотенец', 'их', 'длинной',
        'поскользнулся', 'положи', 'колышет', 'до тысяча девятьсот пятого',
        'сапог', 'пламени', 'более уверенно', 'ездите', 'уместный',
        'двухстах шестидесяти', 'яслей', 'восемьсот', 'пятидесятая', 'выше',
        'строжайше', 'пробуя', 'помидоров', 'худший', 'двухстах', 'тонкий',
        'шкафу', 'критериев', 'парикмахеры', 'лёгкий', 'семьюстами', 'катера',
        'поскользнуться', 'ляг', 'восьмьюдесятью восьмью', 'семьюстами',
        'инженеры', 'восьмисотому', 'ботинок', 'слаще', 'директора',
        'интереснее всех', 'полотенец', 'бойче', 'пятистах', 'кухонь',
        'более крупная', 'поезжайте', 'добрейший', 'поезжай', 'кухонь',
        'клапаны', 'килограммов', 'серьёзнейшее', 'шестьсот', 'доктора',
        'граммов', 'шестьюдесятью', 'прополощи', 'двое', 'дольше', 'звонче',
        'заезжай', 'их', 'поезжайте', 'шампуня', 'решению', 'яблонь',
        'человек', 'носков', 'деревни', 'поезжай', 'ответственный', 'громко',
        'богатейший', 'полтораста', 'кочерёг', 'пятисот', 'две тысячи шестом',
        'семьюстами', 'яслей', 'широко', 'свежая', 'свечей', 'девяноста',
        'лягте', 'предупрежу', 'менее привлекательный', 'директора', 'лекторы',
        'ездит', 'помашите', 'облаков', 'чулок', 'трёмстам', 'серёг',
        'мандаринов', 'красивее', 'пять', 'почерк', 'пятьюстами',
        'две тысячи пятом', 'пятистах', 'тридцати пяти', 'одень', 'приезде',
        'блюдец', 'килограммов', 'чище', 'двухсот', 'кухонь', 'аршин', 'яблок',
        'обоих', 'красивейший', 'поезжай', 'комментариев', 'грузин', 'блюдец',
        'помидоров', 'шестьюстами', 'баклажанов', 'лагеря', 'помидоров',
        'макарон', 'георгинов', 'длиннее', 'озяб', 'полутора',
        'времяпрепровождение', 'поезжай', 'триста пятьдесят', 'приговоры',
        'трехстах', 'две тысячи', 'две', 'светлый', 'высохла', 'жаждет',
        'тысяча', 'макарон', 'мяукает', 'тремястами', 'звонче', 'клади',
        'шестьюдесятью', 'ста двадцатью пятью', 'вишен', 'чудеснейший',
        'наисложнейшая', 'выкладывай', 'туфель', 'изрешечённый', 'консервов',
        'яблок', 'обоими', 'помидоров', 'оладий', 'полотенец', 'поезжайте',
        'заморозков', 'две', 'тысяча восьмисотом', 'приобретший',
        'семьюдесятью', 'более весело', 'наиболее сильный', 'более сильньй',
        'смотря', 'чулок', 'передайте', 'помашите', 'ихние', 'полутораста',
        'трое', 'ей', 'крепкий', 'восьмьюдесятью', 'цапель', 'обгрызенных',
        'звонче', 'их', 'шестидесяти', 'яблок', 'консервов', 'четверо',
        'выгодно', 'лажу', 'осетин', 'пятисотым', 'договоры', 'семисот',
        'более надёжный', 'чудеснейшим', 'выразительно', 'грунте', 'новый',
        'макарон', 'граммов', 'полутораста', 'их', 'наисложнейший', 'семью',
        'наилучший', 'гнездовий', 'умолк', 'менее значимый', 'вишен', 'их',
        'более разговорчивым', 'наилучших', 'поезжайте', 'кладите', 'пятисот',
        'наивысшее', 'их', 'паспорта', 'соболий', 'абрикосов', 'обеих',
        'прессы', 'граммов', 'седьмых', 'паспорта', 'лягте', 'барышень',
        'векторы', 'их', 'тысяча девятьсот двадцать пятом', 'лжём', 'джемперы',
        'полощет', 'долго', 'дешевле', 'солдат', 'ста', 'девятьюстами',
        'снайперы', 'шампунем', 'шестьюдесятью', 'выздоровею', 'шкафу',
        'повеселее', 'более просто', 'тюля', 'абрикосов', 'джинсов',
        'пятистам', 'возврата', 'детьми', 'нелепейший', 'возрасты', 'джинсов',
        'их', 'сладчайший', 'их', 'обоих', 'килограммов', 'их', 'вкуснее',
        'абзацев', 'джинсов', 'аэропорте', 'строгий', 'наиболее решительно',
        'поезжайте', 'две тысячи', 'их', 'шампунем', 'скорейшего', 'яблонь',
        'ботинок', 'оладий', 'шестьюстами', 'падежей', 'девяноста', 'партизан',
        'лекторы', 'более красивый', 'паспорта', 'граммов', 'обеих', 'надень',
        'лучший', 'башен', 'полотенец', 'трёхстах', 'профессора', 'спален',
        'договоры', 'подожжёшь', 'шестисот', 'инженеры', 'паспорта',
        'двумястами', 'четырьмястами', 'полезно', 'интереснее', 'выборы',
        'двенадцатого', 'договоры', 'семисот', 'семистах', 'кратчайший', 'его',
        'выйди', 'бухгалтеры', 'туфель', 'ответ', 'сердец', 'лжёте',
        'поезжайте', 'поместий', 'ботинок', 'пропуска', 'наиболее', 'исчез',
        'боку', 'наиболее уместный', 'надеть', 'их', 'договоры', 'абзацев',
        'сотрёт', 'выздоровеет', 'крошечный', 'молодой', 'мандаринов',
        'четырьмястами', 'четверых', 'две тысячи пятому', 'попробует', 'ярко',
        'туфля', 'времени', 'пятьюдесятью', 'их', 'две тысячи семнадцатому',
        'джинсов', 'платьев', 'яблок', 'шестьюстами', 'кастрюль', 'его',
        'ветру', 'торты', 'поезжайте', 'полутора', 'попробуем', 'их', 'открыв',
        'красивейший', 'серёг', 'двухтысячном', 'движет', 'обгрызенная',
        'печёт', 'килограммов', 'напоив', 'тонн', 'две тысячи',
        'наиболее точно', 'сорта', 'попробует', 'с детьми', 'наиболее',
        'поезжай', 'пояса', 'полощет', 'четырьмястами', 'носков', 'зажжётся',
        'поезжайте', 'профессора', 'обоих', 'их', 'ножниц', 'именин',
        'более долго', 'две тысячи шестому', 'вишен', 'паспорта', 'потолще',
        'взгляни', 'сапог', 'посуше', 'восьмисотом', 'четырёх тысяч',
        'поезжайте', 'сильнее', 'выправь', 'искомый', 'моложе', 'красивее',
        'обоих', 'колышется', 'посади', 'исчезнув', 'барышень',
        'менее интересно', 'их', 'недра', 'директора', 'тарифу',
        'более слабый', 'князья', 'звонче', 'зажжётся', 'полиса', 'пихт',
        'лягте', 'полутора', 'погон', 'девятьюстами', 'полутораста', 'лезь',
        'клади', 'четырьмястами', 'попробуем', 'лягте', 'каникул', 'титулы',
        'всего', 'лазайте', 'поскользнулся', 'тысячу', 'контейнеры', 'умный',
        'шорт', 'поезжай', 'поезжайте', 'их', 'три'
    ]

    pos_dict = dict()

    for pos_ in pos_types:
        pos_dict[pos_] = 0

    features = [
        len(x),
        len(x.split()),
        len(X_high),
        len(X_high.split()),
        rnc_unigrams[X_high.lower()] if X_high.lower() in rnc_unigrams else 10
    ]

    for word_ in x.lower().split():
        pos_dict[str(morph.parse(word_.lower())[0].tag.POS)] += 1
    pos_features = list(pos_dict.values())

    features.append(X_high.lower() in correct_words)

    features.append(X_high.lower() != solver_7_reconstructor(x))

    return np.array(features + pos_features).astype(int)
Example #37
def having_at_symbol(url):
    symbol = regex.findall(r'@', url)
    if (len(symbol) == 0):
        return -1
    else:
        return 1
Example #38
 def vocab_level(self, vocab):
     return max(
         int(self.hanzi_level(h))
         for h in regex.findall(r'\p{IsHan}', vocab))
Example #39
def oneMismatch(id,seq):
 restring='('+id+'){e<=1}'
 m=regex.findall(restring,seq,overlapped=False)
 return m
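The {e<=1} suffix is the regex module's fuzzy-matching syntax (at most one insertion, deletion, or substitution); for example:

import regex

print(regex.findall('(ACGT){e<=1}', 'AGGT'))  # ['AGGT']  (one substitution away, still matches)
print(regex.findall('ACGT', 'AGGT'))          # []        (an exact search finds nothing)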
Example #40
 def test_regex(self):
     import regex
     self.assertEqual(regex.findall(r'(?i)(a)(b)', 'ab cd AB 1a1b'),
                      [('a', 'b'), ('A', 'B')])
     self.assertEqual(regex.escape('a b', literal_spaces=True), 'a b')
Example #41
            info = driver.find_element_by_xpath(
                f'//*[@id="main-content"]/div/div/section/div[1]/div/div[{i}]/div/div[2]'
            ).text
            if 'snack' in info or 'SNACK' in info:
                print('skip')
                i += 1
                continue
            else:
                timeis = driver.find_element_by_xpath(
                    f'//*[@id="main-content"]/div/div/section/div[1]/div/div[{i}]/div/div[2]/span[1]'
                ).text
                serve = driver.find_element_by_xpath(
                    f'//*[@id="main-content"]/div/div/section/div[1]/div/div[{i}]/div/div[2]/span[2]'
                ).text
                # print(timeis)
                serveT = re.findall(r'\d+', serve)[0]
                # print(serveT)

        except:

            num += 1
            i = 1
            continue

        for word in wordlist:
            if word in title:
                print("ERROR")
                ERROR = True
                break
            # check number
        if (re.search(r'\d', title)):
Example #42
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import re
import regex

result = re.findall("[a-z]\d", "a1b2c3")
print result
print '*' * 30

sample = "taatbbtctdd"
result = re.findall("t..", sample)
print result  # ['taa', 'tbb', 'tct']

result = regex.findall("t..", sample, overlapped=True)
print result
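# prints ['taa', 'tbb', 'tct', 'tdd']: overlapped=True also reports 'tdd',
# which starts on the trailing 't' of 'tct' and is skipped by the search above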
print '*' * 30

sample = "Customer number: 232454, Date: February 12, 2011"
nums = re.findall("[0-9]+", sample)
print nums
print '*' * 30

result = re.findall('(\D)([\d.]+)', u'语80.5外92数96.5')
print repr(result).decode('unicode-escape')
print '*' * 30

s = ('''
    <ul>
        <li>something</li>
        <li>something else</li>
Example #43
def counteur(check, pattern):
    #	print("counteur: ", check)
    #	print("pattern: ", pattern)
    return len(re.findall(pattern, check, overlapped=True))
Example #44
def _extract_from_html(msg_body):
    """
    Extract not quoted message from provided html message body
    using tags and plain text algorithm.

    First cut out some encoding-related html tags, such as the xml
    declaration and doctype, to avoid conflicts with unicode decoding.

    Cut out the 'blockquote', 'gmail_quote' tags.
    Cut Microsoft quotations.

    Then use plain text algorithm to cut out splitter or
    leftover quotation.
    This works by adding checkpoint text to all html tags,
    then converting html to text,
    then extracting quotations from text,
    then checking deleted checkpoints,
    then deleting necessary tags.
    """
    if msg_body.strip() == b'':
        return msg_body

    msg_body = msg_body.replace(b'\r\n', b'\n')

    # Required for python3
    if isinstance(msg_body, bytes):
        msg_body = msg_body.decode('utf8')

    msg_body = re.sub(r"\<\?xml.+\?\>|\<\!DOCTYPE.+]\>", "", msg_body)

    html_tree = html_document_fromstring(msg_body)

    if html_tree is None:
        return msg_body

    cut_quotations = (html_quotations.cut_gmail_quote(html_tree)
                      or html_quotations.cut_zimbra_quote(html_tree)
                      or html_quotations.cut_blockquote(html_tree)
                      or html_quotations.cut_microsoft_quote(html_tree)
                      or html_quotations.cut_by_id(html_tree)
                      or html_quotations.cut_from_block(html_tree))
    html_tree_copy = deepcopy(html_tree)

    number_of_checkpoints = html_quotations.add_checkpoint(html_tree, 0)
    quotation_checkpoints = [False] * number_of_checkpoints
    plain_text = html_tree_to_text(html_tree)
    plain_text = preprocess(plain_text, '\n', content_type='text/html')
    lines = plain_text.splitlines()

    # Don't process too long messages
    if len(lines) > MAX_LINES_COUNT:
        return msg_body

    # Collect checkpoints on each line
    line_checkpoints = [
        [
            int(i[4:-4])  # Only checkpoint number
            for i in re.findall(html_quotations.CHECKPOINT_PATTERN, line)
        ] for line in lines
    ]

    # Remove checkpoints
    lines = [
        re.sub(html_quotations.CHECKPOINT_PATTERN, '', line) for line in lines
    ]

    # Use plain text quotation extracting algorithm
    markers = mark_message_lines(lines)
    return_flags = []
    process_marked_lines(lines, markers, return_flags)
    lines_were_deleted, first_deleted, last_deleted = return_flags

    if not lines_were_deleted and not cut_quotations:
        return msg_body

    if lines_were_deleted:
        #collect checkpoints from deleted lines
        for i in range(first_deleted, last_deleted):
            for checkpoint in line_checkpoints[i]:
                quotation_checkpoints[checkpoint] = True

        # Remove tags with quotation checkpoints
        html_quotations.delete_quotation_tags(html_tree_copy, 0,
                                              quotation_checkpoints)

    if _readable_text_empty(html_tree_copy):
        return msg_body

    # NOTE: We remove_namespaces() because we are using an HTML5 Parser, HTML
    # parsers do not recognize namespaces in HTML tags. As such the rendered
    # HTML tags are no longer recognizable HTML tags. Example: <o:p> becomes
    # <oU0003Ap>. When we port this to golang we should look into using an
    # XML Parser NOT an HTML5 Parser since we do not know what input a
    # customer will send us. Switching to a common XML parser in python
    # opens us up to a host of vulnerabilities.
    # See https://docs.python.org/3/library/xml.html#xml-vulnerabilities
    #
    # The down sides to removing the namespaces is that customers might
    # judge the XML namespaces important. If that is the case then support
    # should encourage customers to perform XML parsing of the un-stripped
    # body to get the full unmodified XML payload.
    #
    # Alternatives to this approach are
    # 1. Ignore the U0003A in tag names and let the customer deal with it.
    #    This is not ideal, as most customers use stripped-html for viewing
    #    emails sent from a recipient, as such they cannot control the HTML
    #    provided by a recipient.
    # 2. Perform a string replace of 'U0003A' to ':' on the rendered HTML
    #    string. While this would solve the issue simply, it runs the risk
    #    of replacing data outside the <tag> which might be essential to
    #    the customer.
    remove_namespaces(html_tree_copy)
    return html.tostring(html_tree_copy)
Example #45
def extract_proper_noun(txt):
    return re.findall(r"(?<!^|\. |\.  )[A-Z][a-z]+", txt)
Example #46
 for single_date in search_date:
     db_name = '7m_matchs_' + single_date.replace(
         '-', '_')
     db = client[db_name]  # get a handle to the database
     coll_match_list = db.collection_names()
     match_info_dict = {
         'support_total_right': 0,
         'support_total_netRate': 0,
         'support_total_num': 0,
         'match_info_list': [],
     }
     for coll_match_name in coll_match_list:
         # skip collections that do not hold whole-match data
         if len(
                 regex.findall(
                     r'match_',
                     coll_match_name)) == 0:
             continue
         coll = db[
             coll_match_name]  # get a handle to the collection
         match_id = coll_match_name.split('_')[-1]
         for single_match_dict in coll.find():
             # iterate over all of the day's matches
             league_name = single_match_dict[
                 'league_name']
             home_name = single_match_dict[
                 'home_name']
             away_name = single_match_dict[
                 'away_name']
             start_time = single_match_dict[
                 'start_time']  # e.g. 2018-01-12 18:00
Example #47
def find_keywords_in_graph(vba_func_dict, DG):
    """Find and highlight possible malicious keywords in graph
    
    Args:
        vba_func_dict (dict[func_name]=func_code): Functions dictionary
        DG (networkx.DiGraph): Generated directed graph
    
    Returns:
        networkx.DiGraph: Directed Graph with keywords highlighted in red
    """
    # analyze function calls
    for func_name in vba_func_dict:

        func_code = vba_func_dict[func_name]
        # split function code into lines
        func_code_lines = filter(None, re.split("\n", func_code))

        # handle malicious keywords
        keywords_re_sensetive = "(" + ")|(".join(lst_mal_case_sensetive) + ")"
        keywords_re_insensetive = "(" + ")|(".join(lst_mal_case_insensetive) + ")"

        # iterate over all the words in func_code and match mal_regexes
        dict_items = {}
        for token in func_code_lines:
            match_findall_sensetive = re.findall(keywords_re_sensetive, token)
            match_findall_insensetive = re.findall(keywords_re_insensetive, token, re.IGNORECASE)
            match_findall = match_findall_sensetive + match_findall_insensetive
            if match_findall:
                for match in match_findall:
                    match_list = list(match)

                    # use dictionary dict_items to count occurrences of keywords
                    for list_item in match_list:
                        if list_item != "":
                            if list_item not in dict_items:
                                dict_items[list_item] = 1
                            else:
                                dict_items[list_item] = dict_items[list_item] + 1

        # add keywords to graph
        for dic_key in dict_items:
            if dic_key in lst_obfuscation_keywords:
                keyword_color = color_scheme["COLOR_OBFUSCATION_KEYWORD"]
            else:
                keyword_color = color_scheme["COLOR_REGULAR_KEYWORD"]

            keyword_count = dict_items[dic_key]
            if DG.node[func_name]["keywords"] != "":
                DG.node[func_name]["keywords"] = DG.node[func_name]["keywords"] + ","
                DG.node[func_name]["n_calls"] = DG.node[func_name]["n_calls"] + ","

            DG.node[func_name]["keywords"] = DG.node[func_name]["keywords"] + "<font color='" + keyword_color + "'>" + dic_key + "[" + str(keyword_count) + "]" + "</font>"

            call = {'name': dic_key,
                    "count": keyword_count,
                    "sensitivity": keyword_color,}

            if DG.node[func_name]["calls"]:
                DG.node[func_name]["calls"].append(call)
            else:
                DG.node[func_name]["calls"] = [call]

        # handle autorun keywords
        keywords_re = "(" + ")|(".join(lst_autorun) + ")"
        if re.match(keywords_re, func_name, re.IGNORECASE):
            DG.node[func_name]["color"] = color_scheme["COLOR_AUTORUN_FUNCTIONS"]

    return DG
Example #48
    print("Analyzing:", title)

    # Scrapping html data for a page with Selenium
    URL = 'https://www.hooktheory.com' + song
    PATH = '/Users/connorobrien/Documents/Coding/Resources/chromedriver'
    driver = webdriver.Chrome(PATH)
    driver.get(URL)
    element = driver.find_element_by_class_name('ng-isolate-scope')
    driver.execute_script("arguments[0].scrollIntoView(true);", element)
    time.sleep(10)
    src = driver.page_source
    driver.quit()

    # Getting the key
    keys = re.findall(
        r'(<tspan class="gotham">)([\w])(<\/tspan><\/div><div class="secondary">)(\w\w\w)(<\/div>)',
        src)
    sharp = re.findall(
        r'<div class="primary"><tspan class="gotham">([\w])<\/tspan><tspan class="scale-degrees" dx="0\.05ex">(.)<\/tspan><\/div><div class="secondary">(\w\w\w)<\/div>',
        src)
    if len(sharp) == 0:
        key = keys[0][1] + keys[0][3]
    else:
        key = sharp[0][0] + sharp[0][1] + sharp[0][2]
    key_list.append(key)
    chord_list_full.append(key)

    ## Getting the main chords
    chords = re.findall(
        r'(<tspan class="times">([\w])<\/tspan>)?(<tspan class="times">)(\w)(<\/tspan>)(((<tspan class="times">)(\w)(<\/tspan>)(<\/text>))|(<\/text>))',
        src)
Example #49
SNV_RS['rsID'] = SNV_RS['rsID'].str.replace('rs',"")



SNV_RS.to_csv('SNV_strip.csv', sep='\t', encoding = 'utf-8')
BE_CtoT_editable = SNV_RS[SNV_RS['Variant cDNA name'].str.contains("T>C")]
BE_GtoA_editable = SNV_RS[SNV_RS['Variant cDNA name'].str.contains("A>G")]
ABE_TtoC_editable = SNV_RS[SNV_RS['Variant cDNA name'].str.contains("C>T")]
ABE_AtoG_editable = SNV_RS[SNV_RS['Variant cDNA name'].str.contains("G>A")]



handle = open('mart_export.txt' , "rU")
Flanks={}
for record in SeqIO.parse(handle, "fasta") :
	Flanks[int(record.id.split("|")[3].strip('rs'))]=regex.findall('.{25}[^A,T,C,G].{25}', record.seq.tostring())
handle.close()
F=pd.DataFrame({'rsID': list(Flanks.keys()), 'Flanks': [x for x in Flanks.values()]})

F['rsID'] =F['rsID'].astype(str)
SNV_RS['rsID'] =SNV_RS['rsID'].astype(str)
ABE_AtoG_editable = F.merge(ABE_AtoG_editable, left_on='rsID', right_on='rsID', how='right')
ABE_TtoC_editable = F.merge(ABE_TtoC_editable, left_on='rsID', right_on='rsID', how='right')
BE_CtoT_editable = F.merge(BE_CtoT_editable, left_on='rsID', right_on='rsID', how='right')
BE_GtoA_editable = F.merge(BE_GtoA_editable, left_on='rsID', right_on='rsID', how='right')


ABE_AtoG_editable['gRNAs']=None
ABE_AtoG_editable['gRNAall']=None
for i in range(len(ABE_AtoG_editable)):
    if type(ABE_AtoG_editable.iloc[i].Flanks)==list and ABE_AtoG_editable.iloc[i].Flanks!=[]:
Example #50
0
def count(x: str, y: str, overlapped: bool = False) -> int:
    return len(regex.findall(x, y, overlapped=overlapped))
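# A quick usage sketch (assumes the third-party `regex` package is installed):
# with overlapped=True every starting position counts, so "aaaa" contains three
# overlapping copies of "aa" but only two non-overlapping ones.
import regex
assert count("aa", "aaaa") == 2
assert count("aa", "aaaa", overlapped=True) == 3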
Example #51
0
def git_commit_info():
    git = Repo('.')
    commit = git.get_object(git.head())
    return {'id': commit.id.decode("utf-8")[0:7], 'id_full': commit.id.decode("utf-8"),
            'author': regex.findall("(.*?) <(.*?)>", commit.author.decode("utf-8"))[0],
            'message': commit.message.decode("utf-8").strip('\r\n').split('\n')[0]}
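# Usage sketch; assumptions only: the snippet appears to rely on dulwich's Repo
# (from dulwich.repo import Repo) and the third-party regex module, neither of
# which is imported above, and '.' must be an actual git checkout.
#
#     info = git_commit_info()
#     print(info['id'], info['author'][0], '-', info['message'])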
Example #52
0
File: VAE.py Project: marlonbetz/BA
geo_info = dict()
for i,line in enumerate(codecs.open(pathToASJPCorpusFile,"r","utf-8")):
    if i > 0:
        line = line.split("\t")
        if "PROTO" not in line[0] and "ARTIFICIAL" not in line[2]:
            words = line[10:]
            # remove invalid characters ('%'); reassigning the loop variable
            # would never change the list, so rebuild it instead
            words = [word.replace("%", "") for word in words]
            """
            for cells that contain more than one word, add each additional word as a new entry
            """
            tba = []
            for i_w,word in enumerate(words):
                if "," in word:
                    for match in regex.findall("(?<=,).+", word):
                        tba.append(match)
                    # reduce the entry to the part before the first separator
                    words[i_w] = word[:word.index(",")]
            words.extend(tba)
            allWords.extend(words)
            geo_info[line[0]] = [line[5],line[6]]
    
"""
EXTRACT ALL PHONEMES AND ADD WORD BOUNDARIES AND GET RID OF EMPTY STRINGS
"""
print("EXTRACT ALL PHONEMES AND ADD WORD BOUNDARIES AND GET RID OF EMPTY STRINGS")
allWords = [["<s>"]+getListofASJPPhonemes(word)+["</s>"] for word in allWords if len(word) > 0]
 
"""
COUNT PHONEMES
Example #53
0
 def extract_song_snippet(self):
   pattern = '\n\n(.*?)\n\n'
   search_results = regex.findall(pattern, self.abc_text, overlapped=True, flags=regex.DOTALL)
   self.songs = [song for song in search_results]
   print ("Found {} possible songs in generated texts".format(len(self.songs)))
Example #54
0
 def test_regex(self):
     import regex
     self.assertEqual(regex.findall(r'(?i)(a)(b)', 'ab cd AB 1a1b'),
                      [('a', 'b'), ('A', 'B')])
Example #55
0
        fetch_properties, fetch_owners = False, True
    else:
        parser = argparse.ArgumentParser(description='What to fetch')
        parser.add_argument('--properties', help='fetch properties', dest='properties', action='store_true', required=False)
        parser.add_argument('--owners', help='fetch owners', dest='owners', action='store_true', required=False)
        parser.set_defaults(owners=False, properties=False)
        args   = parser.parse_args()
        fetch_properties, fetch_owners = args.properties, args.owners
    
    data_dir   = f'{ROOT_DIR}/data/data_{CNTY_SFFX}'
    output_dir = f'{ROOT_DIR}/output/output_{CNTY_SFFX}'
    os.makedirs(data_dir, exist_ok=True)

    records_url   = 'http://www.jonescad.org/open_records.htm'
    response      = requests.get(records_url)
    href_lines    = re.findall(r'<a href="open records/[^>]*>', response.text)
    filenames     = [tag.strip(r'<a href="').strip(r'">"').split('"')[0] for tag in href_lines]
    tax_filenames = [fn for fn in filenames if re.match(r".*real.*roll.*\.xls", fn.lower())]
    
    if not tax_filenames:
        sys.stderr.write('No appraisal file in county open records!')
        
    url_appraisal_roll = 'http://www.jonescad.org/'+tax_filenames[-1]
    url_appraisal_roll = url_appraisal_roll.replace('&amp;', '&')
    response           = requests.get(url_appraisal_roll)
    fname              = f'{data_dir}/appraisal_roll.xls'

    with FileLock(fname) as f_lock: # preventing simultaneous dumping
        with open(fname, 'wb') as f:
            f.write(response.content)
            
Example #56
0
 def findUrl(self, text=""):
     if not text:
         text = self.text
     return regex.findall(r"http[s]*?://\S+\.\S+", text)
Example #57
0
import regex as re

# Sum every run of digits found in the input file.
with open("C:\\Users\\Pandinha\\Desktop\\Projects\\Python\\ReGeX\\test2.txt") as file:
    print(sum(int(num) for num in re.findall('[0-9]+', file.read())))
"""
nums = []
file = open("C:\\Users\\Pandinha\\Desktop\\Projects\\Python\\ReGeX\\test1.txt")
for texts in file:
    text = re.findall('[0-9]+', texts)
    if len(text) != 0:
        for num in text:
            number = int(num)
            nums.append(number)
print(sum(nums))
"""
# result test1 : 445833
# result test2 : 369277
Example #58
0
 def findEmail(self, text=""):
     if not text:
         text = self.text
     return regex.findall(r"\S*@\S*", text)
Example #59
0
File: VAE.py Project: marlonbetz/BA
def getListofASJPPhonemes(word):
    phonemes_alone="pbmfv84tdszcnSZCjT5kgxNqGX7hlLwyr!ieaouE3"
    phonemeSearchRegex = "["+phonemes_alone+"][\"\*]?(?!["+phonemes_alone+"]~|["+phonemes_alone+"]{2}\$)|["+phonemes_alone+"]{2}?~|["+phonemes_alone+"]{3}?\$"
    return regex.findall(phonemeSearchRegex, word)
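# A minimal illustrative call (the example word is an assumption, not taken
# from the project's data):
#
#     getListofASJPPhonemes("palu")   # -> ['p', 'a', 'l', 'u']
#
# Multi-character ASJP symbols marked with "~" or "$" are kept together as a
# single phoneme token rather than split into individual letters.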
Example #60
0
    def recompose(self, output_xhtml5: bool) -> str:
        """
		Iterate over the XHTML files in this epub and "recompose" them into a single XHTML string representing this ebook.

		INPUTS
		output_xhtml5: true to output XHTML5 instead of HTML5

		OUTPUTS
		A string of HTML5 representing the entire recomposed ebook.
		"""

        # Get the ordered list of spine items
        with open(self.metadata_file_path, "r", encoding="utf-8") as file:
            metadata_soup = BeautifulSoup(file.read(), "lxml")

        # Get some header data: title, core and local css
        title = html.escape(metadata_soup.find("dc:title").contents[0])
        css = ""
        with open(self.path / "src" / "epub" / "css" / "core.css",
                  "r",
                  encoding="utf-8") as file:
            css = file.read()

        with open(self.path / "src" / "epub" / "css" / "local.css",
                  "r",
                  encoding="utf-8") as file:
            css = css + "\n\n\n/* local.css */" + file.read()

        namespaces = set(regex.findall(r"@namespace.+?;", css))

        css = regex.sub(r"@(charset|namespace).+?;", "", css).strip()

        if namespaces:
            css = "\n" + css

        for namespace in namespaces:
            css = namespace + "\n" + css

        css = "\t\t\t".join(css.splitlines(True))

        output_xhtml = "<?xml version=\"1.0\" encoding=\"utf-8\"?><html xmlns=\"http://www.w3.org/1999/xhtml\" xmlns:epub=\"http://www.idpf.org/2007/ops\" epub:prefix=\"z3998: http://www.daisy.org/z3998/2012/vocab/structure/, se: https://standardebooks.org/vocab/1.0\"><head><meta charset=\"utf-8\"/><title>" + title + "</title><style/></head><body></body></html>"
        output_soup = BeautifulSoup(output_xhtml, "lxml")

        # Iterate over spine items in order and recompose them into our output
        for element in metadata_soup.select("spine itemref"):
            filename = metadata_soup.select(
                f"item[id=\"{element['idref']}\"]")[0]["href"]

            with open(self.path / "src" / "epub" / filename,
                      "r",
                      encoding="utf-8") as file:
                xhtml_soup = BeautifulSoup(file.read(), "lxml")

                for child in xhtml_soup.select("body > *"):
                    self._recompose_xhtml(child, output_soup)

        # Add the ToC after the titlepage
        with open(self.path / "src" / "epub" / "toc.xhtml",
                  "r",
                  encoding="utf-8") as file:
            toc_soup = BeautifulSoup(file.read(), "lxml")
            output_soup.select("#titlepage")[0].insert_after(
                toc_soup.find("nav"))

        # Get the output XHTML as a string
        output_xhtml = str(output_soup)
        output_xhtml = regex.sub(r"\"(\.\./)?text/(.+?)\.xhtml\"", "\"#\\2\"",
                                 output_xhtml)
        output_xhtml = regex.sub(r"\"(\.\./)?text/.+?\.xhtml#(.+?)\"",
                                 "\"#\\2\"", output_xhtml)

        # Replace SVG images hrefs with inline SVG
        for match in regex.findall(r"src=\"../images/(.+?)\.svg\"",
                                   output_xhtml):
            with open(self.path / "src" / "epub" / "images" / (match + ".svg"),
                      "r",
                      encoding="utf-8") as file:
                svg = file.read()

                # Remove XML declaration
                svg = regex.sub(r"<\?xml.+?\?>", "", svg)

                output_xhtml = regex.sub(
                    fr"<img.+?src=\"\.\./images/{match}\.svg\".*?/>", svg,
                    output_xhtml)

        # All done, clean the output
        output_xhtml = se.formatting.format_xhtml(output_xhtml)

        # Insert our CSS. We do this after `clean` because `clean` will escape > in the CSS
        output_xhtml = regex.sub(r"<style/>",
                                 "<style>\n\t\t\t" + css + "\t\t</style>",
                                 output_xhtml)

        if output_xhtml5:
            output_xhtml = output_xhtml.replace(
                "\t\t<meta charset=\"utf-8\"/>\n", "")
            output_xhtml = output_xhtml.replace("\t\t<style/>\n", "")
        else:
            # Remove xml declaration and re-add the doctype
            output_xhtml = regex.sub(r"<\?xml.+?\?>", "<!doctype html>",
                                     output_xhtml)
            output_xhtml = regex.sub(r" epub:prefix=\".+?\"", "", output_xhtml)

            # Make some replacements for HTML5 compatibility
            output_xhtml = output_xhtml.replace("epub:type", "data-epub-type")
            output_xhtml = output_xhtml.replace("epub|type", "data-epub-type")
            output_xhtml = output_xhtml.replace("xml:lang", "lang")
            output_xhtml = output_xhtml.replace(
                "<html",
                f"<html lang=\"{metadata_soup.find('dc:language').string}\"")
            output_xhtml = regex.sub(" xmlns.+?=\".+?\"", "", output_xhtml)

        return output_xhtml
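# A possible call site, sketched under assumptions: the enclosing epub class
# and its constructor are not shown above, so the names here are hypothetical.
#
#     epub = SomeEpubClass("path/to/ebook")
#     html5_string = epub.recompose(output_xhtml5=False)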