from System import Uri
from System.Text.RegularExpressions import Regex, RegexOptions


def check_created_image_regex(first_image_url, second_image_url, regex):
    matches = Regex.Matches(first_page_source._source, regex,
                            RegexOptions.IgnoreCase)

    escaped_first_image_url = escape_uri_string(first_image_url)
    escaped_second_image_url = escape_uri_string(second_image_url)

    if matches.Count == 0:
        return False, 0
    else:
        if debug:
            print "\nFound " + str(
                matches.Count
            ) + " match(es) on the first page with regex: " + regex
            print "Captured: " + matches[0].Value
            print "link group: " + matches[0].Groups["link"].Value

        #We don't care if there is more than one result, as long as the first result is the correct image
        result, image_uri = Uri.TryCreate(first_page_uri,
                                          matches[0].Groups["link"].Value)

        #Valid url and matches the input image url
        if result and image_uri.AbsoluteUri in (first_image_url,
                                                escaped_first_image_url):
            if debug: print "Valid uri and matches image url"

        else:
            if debug: print "Not a valid uri or doesn't match image url"
            return False, 0

        matches_second = Regex.Matches(second_page_source._source, regex,
                                       RegexOptions.IgnoreCase)

        if matches_second.Count == 0:
            if debug: print "No matches on the second page"
            return False, 0

        #Regex match on the second page. Same deal as above: we don't care if there is more than one result
        if debug:
            print "\nFound " + str(
                matches_second.Count) + " match(es) on the second page"
            print "Captured: " + matches_second[0].Value
            print "link group: " + matches_second[0].Groups["link"].Value

        result, image_uri = Uri.TryCreate(
            second_page_uri, matches_second[0].Groups["link"].Value)

        if result and image_uri.AbsoluteUri in (second_image_url,
                                                escaped_second_image_url):

            if debug:
                print "Regex works on both pages and returns the correct image"
            return True, matches.Count

        else:
            if debug: print "Invalid Uri or doesn't match the second image url"
            return False, 0

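#The checker above depends on module-level state that is not shown in this snippet:
#first_page_source / second_page_source (objects whose ._source attribute holds the
#downloaded HTML of each page), first_page_uri / second_page_uri (System.Uri objects
#used as the base for resolving relative links) and a debug flag.
#escape_uri_string is also defined elsewhere; a minimal sketch, guessing that it simply
#wraps the .NET escaper:
def escape_uri_string(url):
    #Percent-encode any characters that are not already valid in a URI
    return Uri.EscapeUriString(url)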
import clr
clr.AddReference("System.Web.Extensions")   #JavaScriptSerializer lives in this assembly

from System import Activator, Type, Environment as Env
from System.IO import DirectoryInfo
from System.Text.RegularExpressions import Regex, RegexOptions
from System.Web.Script.Serialization import JavaScriptSerializer
from System.Security.Principal import SecurityIdentifier, NTAccount
from Microsoft.Win32 import Registry


def browserEnum():
    summary = printHeader("BROWSER ENUM")
    regex = Regex(r'(http|ftp|https|file)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?')

    #Active IE Urls
    summary += printSubheader("ACTIVE EXPLORER URLS")
    app = Activator.CreateInstance(Type.GetTypeFromProgID("Shell.Application"))
    summary += "\n".join([w.LocationUrl() for w in app.Windows()])

    #Chrome History
    summary += printSubheader("\n\nChrome History")
    try:
        cHistPath = "{0}\Users\{1}\AppData\Local\Google\Chrome\User Data\Default\History".format(Env.GetEnvironmentVariable("systemdrive"), Env.UserName)
        cHist = open(cHistPath, "r").read()
        summary += "\n".join(["[*] {0}\n".format(m.Value) for m in regex.Matches(cHist)][-10:])
    except:
        pass

    summary += printSubheader("\nChrome Bookmarks")
    #Chrome Bookmarks
    try:
        cBMPath = "{0}\Users\{1}\AppData\Local\Google\Chrome\User Data\Default\Bookmarks".format(Env.GetEnvironmentVariable("systemdrive"), Env.UserName)
        js = JavaScriptSerializer()
        cBM = js.DeserializeObject(open(cBMPath, "r").read())
        urls = cBM["roots"]["bookmark_bar"]["children"]
        for url in urls:
            u = url['url']
            d = url['name']
            summary += "[*] {0}\n{1}\n\n".format(d, u)
    except:
        pass

    summary += printSubheader("Firefox History")
    #Firefox History
    try:
        regex = Regex('(http|ftp|https|file)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?')
        fHistPath = "{0}\Users\{1}\AppData\Roaming\Mozilla\Firefox\Profiles".format(Env.GetEnvironmentVariable("systemdrive"), Env.UserName)
        for path in DirectoryInfo(fHistPath).EnumerateDirectories("*.default"):
            places = open(path.FullName + "\places.sqlite", "r").read()
            summary += "\n".join(["[*] {0}\n".format(m.Value) for m in regex.Matches(places)][:10])
    except:
        pass

    summary += printSubheader("IE History")
    typedUrlPath = "\Software\Microsoft\Internet Explorer\TypedURLs"
    for sid in Registry.Users.GetSubKeyNames():
        if sid != ".DEFAULT" and not sid.endswith("Classes"):
            try:
                typedUrlsKey = Registry.Users.OpenSubKey(sid + typedUrlPath)
                if typedUrlsKey != None:
                    summary += "[{0}][{1}]\n".format(sid, SecurityIdentifier(sid.ToString()).Translate(NTAccount))
                    for value in typedUrlsKey.GetValueNames():
                        summary += "\t{0}\n".format(typedUrlsKey.GetValue(value))
                summary += "\n"
            except SystemError:
                pass

    return summary    
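
#browserEnum() also assumes two small report-formatting helpers, printHeader and
#printSubheader, that are defined elsewhere in the module. A minimal sketch of what
#they might look like (the exact formatting is a guess):
def printHeader(title):
    #Hypothetical banner for a top-level report section
    return "=" * 40 + "\n" + title + "\n" + "=" * 40 + "\n"

def printSubheader(title):
    #Hypothetical banner for a sub-section of the report
    return "\n[+] " + title + "\n"
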
def check_regex_against_source(regex, check_value, check_value2):

    matches = Regex.Matches(first_page_source._source, regex._regex,
                            RegexOptions.IgnoreCase)

    if matches.Count != 0:

        if debug:
            print "\n\nFound " + str(
                matches.Count
            ) + " match(es) on the first page with regex: " + regex._regex
            print "Captured: " + matches[0].Value
            print "link group: " + matches[0].Groups["link"].Value

        #We don't care if there is more than one result, as long as the first result is the correct image
        result, result_uri = Uri.TryCreate(first_page_uri,
                                           matches[0].Groups["link"].Value)

        if result and result_uri.AbsoluteUri in (check_value, check_value2):

            #Valid url and matches against the check_value

            if debug: print "Valid uri"

        else:
            return False

        matches_second = Regex.Matches(second_page_source._source,
                                       regex._regex, RegexOptions.IgnoreCase)

        if matches_second.Count == 0:
            if debug: print "No matches found on the second page"
            return False

        #Regex match on the second page. Same deal as above: we don't care if there is more than one result

        if debug:
            print "\nFound " + str(
                matches_second.Count) + " match(es) on the second page"
            print "Captured: " + matches_second[0].Value
            print "link group: " + matches_second[0].Groups["link"].Value

        result, result_uri = Uri.TryCreate(
            second_page_uri, matches_second[0].Groups["link"].Value)

        if result:
            regex._matches = matches.Count
            if debug: print "Added to valid regex"
            return regex

        else:
            if debug: print "Invalid uri"
            return False

    return False
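
#check_regex_against_source() expects a small wrapper object with a ._regex attribute
#(the pattern text) and a ._matches attribute it can fill in, rather than a plain
#string. A minimal sketch of that assumed wrapper and a hypothetical call:
class CandidateRegex(object):
    #Stand-in for the real wrapper class, which is not shown in these examples
    def __init__(self, pattern):
        self._regex = pattern
        self._matches = 0

candidate = CandidateRegex(r'href="(?<link>[^"]+\.jpg)"')   #pattern with a "link" group
checked = check_regex_against_source(candidate,
                                     "http://example.com/images/1.jpg",
                                     "http://example.com/images/1.jpg")
#check_value / check_value2 are presumably the expected image URL and its escaped form;
#the call returns the wrapper (with ._matches set) on success, or False otherwise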
import re

from System.Text.RegularExpressions import Regex, RegexOptions


def FindScanners(worker, books):

    #Load the various settings. settings is a dict
    settings = LoadSettings()

    #Load the scanners
    unformatedscanners = LoadListFromFile(SCANNERSFILE)

    #Sort the scanners by length, longest first, so that for example "clickwheel" is tried before "cl" and the longer name wins the alternation.
    unformatedscanners.sort(key=len, reverse=True)

    #Format the scanners for use in the regex
    scanners = "|".join(unformatedscanners)
    scanners = "(?<Tags>" + scanners + ")"

    #Load the blacklist and format it
    blacklist = LoadListFromFile(BLACKLISTFILE)

    blacklist.extend(LoadUserBlackListFromFile(USERBLACKLISTFILE))

    formatedblacklist = "|".join(blacklist)

    #Add in the blacklist

    #These amazing regexes were designed by the amazing Helmic.

    pattern = r"(?:(?:__(?!.*__[^_]))|[(\[])(?!(?:" + formatedblacklist + r"|[\s_\-\|/,])+[)\]])(?<Tags>(?=[^()\[\]]*[^()\[\]\W\d_])[^()\[\]]{2,})[)\]]?"

    replacePattern = r"(?:[^\w]|_|^)(?:" + formatedblacklist + r")(?:[^\w]|_|$)"

    #Create the regex

    regex = Regex(pattern, RegexOptions.IgnoreCase)
    regexScanners = Regex(scanners, RegexOptions.IgnoreCase)
    regexReplace = Regex(replacePattern, RegexOptions.IgnoreCase)

    ComicBookFields = ComicRack.App.GetComicFields()
    ComicBookFields.Remove("Scan Information")
    ComicBookFields.Add("Language", "LanguageAsText")

    for book in books:

        #.net Regex
        #Note that every possible match is found and then the last one is used.
        #This is because in some rare cases more than one thing is mistakenly matched and the scanner is almost always the last match.
        matches = regex.Matches(book.FileName)
        unknowntag = ""

        try:
            match = matches[matches.Count - 1]

        except ValueError:

            #No match
            #print "Trying the Scanners.txt list"

            #Check the defined scanner names
            match = regexScanners.Match(book.FileName)

            #Still no match
            if match.Success == False:
                if settings["Unknown"] != "":
                    unknowntag = settings["Unknown"]
                else:
                    continue

        #Check if what was grabbed is a field in the comic
        fields = []
        for field in ComicBookFields.Values:
            fields.append(unicode(getattr(book, field)).lower())

        if match.Groups["Tags"].Value.lower() in fields:
            print "Uh oh. That matched tag is in the info somewhere."
            newmatch = False
            for n in reversed(range(0, matches.Count - 1)):
                if not matches[n].Groups["Tags"].Value.lower() in fields:
                    match = matches[n]
                    newmatch = True
                    break
            if newmatch == False:
                if settings["Unknown"] != "":
                    unknowntag = settings["Unknown"]
                else:
                    continue

        #Check if the match can be found in () in the series, title or altseries
        titlefields = [
            book.ShadowSeries, book.ShadowTitle, book.AlternateSeries
        ]
        abort = False
        for title in titlefields:
            titleresult = re.search(r"\((?P<match>.*)\)", title)
            if titleresult is not None and titleresult.group("match").lower() == match.Groups["Tags"].Value.lower():
                #The match is part of the title, series or altseries so skip it
                print "The match is part of the title, series or altseries"
                abort = True
                break
        if abort == True:
            if settings["Unknown"] != "":
                unknowntag = settings["Unknown"]
            else:
                continue

        #Get a list of the old ScanInformation
        oldtags = book.ScanInformation
        ListOfTagsTemp = oldtags.split(",")
        if '' in ListOfTagsTemp:
            ListOfTagsTemp.remove('')

        ListOfTags = []
        if ListOfTagsTemp != []:
            for indtag in ListOfTagsTemp:
                ListOfTags.append(indtag.strip())

        #Create our new tag
        if unknowntag != "":
            newtag = settings["Prefix"] + unknowntag
        else:
            newtag = settings["Prefix"] + regexReplace.Replace(
                match.Groups["Tags"].Value.strip("_, "), "")

        if newtag not in ListOfTags:
            ListOfTags.append(newtag)

        #Sort alphabetically to be neat
        ListOfTags.sort()

        #Add to ScanInformation field
        book.ScanInformation = ", ".join(ListOfTags)
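
#FindScanners also leans on helpers from the surrounding ComicRack script (LoadSettings,
#LoadListFromFile, the *FILE constants and the ComicRack app object) that are defined
#elsewhere. A quick, hypothetical illustration of the bracket-tag pattern built above,
#using a placeholder blacklist ("dummy") and an invented filename; the last match is
#taken, just as FindScanners does:
_pattern = r"(?:(?:__(?!.*__[^_]))|[(\[])(?!(?:dummy|[\s_\-\|/,])+[)\]])(?<Tags>(?=[^()\[\]]*[^()\[\]\W\d_])[^()\[\]]{2,})[)\]]?"
_matches = Regex(_pattern, RegexOptions.IgnoreCase).Matches("Some Series 001 (2015) (digital) (SomeScanner).cbz")
print _matches[_matches.Count - 1].Groups["Tags"].Value   #prints "SomeScanner"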