Exemple #1
0
    def __init__(self, row, isUtf8=False):
        """Wrap one article row and work out whether it is a redirect.

        row    -- sequence indexed by the CUR_* constants; kept by reference
                  in self.row and updated in place.
        isUtf8 -- when true, the title and the text are run through
                  utf8ToLatin1() first; the title is additionally passed to
                  entities.convertNumberedEntities() and stored back in the row.

        The row must not satisfy fInvalidRedirect() (checked with an assert).
        self.redirect ends up as the redirect target with spaces replaced by
        underscores, or None when getRedirectFromText() reports no redirect.
        """
        self.row = row
        assert not fInvalidRedirect(row)
        self.md5Hash = None
        self.redirect = None

        articleTxt = row[CUR_TEXT]
        if isUtf8:
            latinTitle = utf8ToLatin1(self.row[CUR_TITLE])
            self.row[CUR_TITLE] = entities.convertNumberedEntities(latinTitle, latinTitle)
            articleTxt = utf8ToLatin1(articleTxt)

        # The CUR_IS_REDIRECT flag is mostly ignored: rows flagged as a
        # redirect that are not one are thrown out before reaching this
        # constructor, and text that looks like a redirect is treated as one
        # even if the flag is not set.
        target = getRedirectFromText(articleTxt)
        if not target:
            # Regular article: store the stripped text back in the row.
            self.row[CUR_TEXT] = articleTxt.strip()
            return

        self.redirect = entities.convertNumberedEntities(
            self.row[CUR_TITLE], target.replace(" ", "_"))
        if int(row[CUR_IS_REDIRECT]) == 0:
            # Report the mismatch between the flag and the text.
            print("%s is a redirect but not marked as such" % self.getTitle())
Exemple #2
0
    def __init__(self, row, isUtf8=False):
        """Wrap one article row and work out whether it is a redirect.

        row    -- sequence indexed by the CUR_* constants; kept by reference
                  in self.row and updated in place.
        isUtf8 -- when true, the title and the text are run through
                  utf8ToLatin1() first; the title is additionally passed to
                  entities.convertNumberedEntities() and stored back in the row.

        The row must not satisfy fInvalidRedirect() (checked with an assert).
        self.redirect ends up as the redirect target with spaces replaced by
        underscores, or None when getRedirectFromText() reports no redirect.
        """
        self.row = row
        assert not fInvalidRedirect(row)
        self.md5Hash = None
        self.redirect = None

        articleTxt = row[CUR_TEXT]
        if isUtf8:
            latinTitle = utf8ToLatin1(self.row[CUR_TITLE])
            self.row[CUR_TITLE] = entities.convertNumberedEntities(latinTitle, latinTitle)
            articleTxt = utf8ToLatin1(articleTxt)

        # The CUR_IS_REDIRECT flag is mostly ignored: rows flagged as a
        # redirect that are not one are thrown out before reaching this
        # constructor, and text that looks like a redirect is treated as one
        # even if the flag is not set.
        target = getRedirectFromText(articleTxt)
        if not target:
            # Regular article: store the stripped text back in the row.
            self.row[CUR_TEXT] = articleTxt.strip()
            return

        self.redirect = entities.convertNumberedEntities(
            self.row[CUR_TITLE], target.replace(" ", "_"))
        if int(row[CUR_IS_REDIRECT]) == 0:
            # Report the mismatch between the flag and the text.
            print("%s is a redirect but not marked as such" % self.getTitle())
Exemple #3
0
def parse411ReversePhoneLookup(htmlTxt):
    """Pull the single listing out of a reverse phone lookup result page.

    Returns a (status, data) tuple:
      (NO_RESULTS, None)      -- the page contains "We did not find a listing"
      (UNKNOWN_FORMAT, None)  -- no <div id="subtext"> or no name inside it
      (RESULTS_DATA, payload) -- payload is the result of
                                 universalDataFormatReplaceEntities() for one
                                 [name, address, city, phone] row
    """
    if htmlTxt.find("We did not find a listing") != -1:
        print("parse411ReversePhoneLookup: no results")
        return (NO_RESULTS, None)

    soup = BeautifulSoup21.BeautifulSoup(htmlTxt)
    subtextDiv = soup.first("div", {"id": "subtext"})
    if not subtextDiv:
        print("parse411ReversePhoneLookup: no div")
        return (UNKNOWN_FORMAT, None)

    rawName = subtextDiv.span.strong.string
    if not rawName:
        print("parse411ReversePhoneLookup: no name")
        return (UNKNOWN_FORMAT, None)
    name = cleanupName(convertNamedEntities(convertNumberedEntities(rawName)))

    # The remaining fields are located purely by position: a fixed number
    # of .next hops from the div, then two more hops for each following field.
    addrNode = subtextDiv.next.next.next.next.next.next
    addressTxt = addrNode.string
    cityNode = addrNode.next.next
    cityTxt = cityNode.string
    phoneTxt = cityNode.next.next.string

    listing = [name, addressTxt, cityTxt, phoneTxt]
    return (RESULTS_DATA, universalDataFormatReplaceEntities([listing]))
Exemple #4
0
def personSearch2(htmlTxt):
    """Print the name, street, city and phone found on a person search page.

    Returns (status, None): the status from testNoResults() when it is not
    RESULTS_DATA, otherwise (UNKNOWN_FORMAT, None) -- including after the
    fields were located and printed; the extracted values are not returned.
    """
    soup = BeautifulSoup21.BeautifulSoup(htmlTxt)

    status = testNoResults(soup)
    if status != RESULTS_DATA:
        return (status, None)

    subtextDiv = soup.first("div", {"id": "subtext"})
    if not subtextDiv:
        print("No divTag")
        return (UNKNOWN_FORMAT, None)

    nameSpan = subtextDiv.span
    nameNode = subtextDiv.span.strong
    print(convertNumberedEntities(nameNode.string.strip()))

    # Street follows the span directly; city and phone are each two
    # siblings further along.
    streetNode = nameSpan.nextSibling
    print(streetNode.string.strip())

    cityNode = streetNode.nextSibling.nextSibling
    print(cityNode.string.strip())

    phoneNode = cityNode.nextSibling.nextSibling
    print(phoneNode.string.strip())

    return (UNKNOWN_FORMAT, None)
Exemple #5
0
def parse411ReversePhoneLookup(htmlTxt):
    """Pull the single listing out of a reverse phone lookup result page.

    Returns a (status, data) tuple:
      (NO_RESULTS, None)      -- the page contains "We did not find a listing"
      (UNKNOWN_FORMAT, None)  -- no <div id="subtext"> or no name inside it
      (RESULTS_DATA, payload) -- payload is the result of
                                 universalDataFormatReplaceEntities() for one
                                 [name, address, city, phone] row
    """
    if htmlTxt.find("We did not find a listing") != -1:
        print("parse411ReversePhoneLookup: no results")
        return (NO_RESULTS, None)

    soup = BeautifulSoup21.BeautifulSoup(htmlTxt)
    subtextDiv = soup.first("div", {"id": "subtext"})
    if not subtextDiv:
        print("parse411ReversePhoneLookup: no div")
        return (UNKNOWN_FORMAT, None)

    rawName = subtextDiv.span.strong.string
    if not rawName:
        print("parse411ReversePhoneLookup: no name")
        return (UNKNOWN_FORMAT, None)
    name = cleanupName(convertNamedEntities(convertNumberedEntities(rawName)))

    # The remaining fields are located purely by position: a fixed number
    # of .next hops from the div, then two more hops for each following field.
    addrNode = subtextDiv.next.next.next.next.next.next
    addressTxt = addrNode.string
    cityNode = addrNode.next.next
    cityTxt = cityNode.string
    phoneTxt = cityNode.next.next.string

    listing = [name, addressTxt, cityTxt, phoneTxt]
    return (RESULTS_DATA, universalDataFormatReplaceEntities([listing]))
Exemple #6
0
def personSearch2(htmlTxt):
    """Print the name, street, city and phone found on a person search page.

    Returns (status, None): the status from testNoResults() when it is not
    RESULTS_DATA, otherwise (UNKNOWN_FORMAT, None) -- including after the
    fields were located and printed; the extracted values are not returned.
    """
    soup = BeautifulSoup21.BeautifulSoup(htmlTxt)

    status = testNoResults(soup)
    if status != RESULTS_DATA:
        return (status, None)

    subtextDiv = soup.first("div", {"id": "subtext"})
    if not subtextDiv:
        print("No divTag")
        return (UNKNOWN_FORMAT, None)

    nameSpan = subtextDiv.span
    nameNode = subtextDiv.span.strong
    print(convertNumberedEntities(nameNode.string.strip()))

    # Street follows the span directly; city and phone are each two
    # siblings further along.
    streetNode = nameSpan.nextSibling
    print(streetNode.string.strip())

    cityNode = streetNode.nextSibling.nextSibling
    print(cityNode.string.strip())

    phoneNode = cityNode.nextSibling.nextSibling
    print(phoneNode.string.strip())

    return (UNKNOWN_FORMAT, None)
Exemple #7
0
def universalDataFormatWithDefinition(definition, listOfLists):
    """Serialize a definition plus rows of strings into one text blob.

    definition  -- object whose serialize() result is emitted as an extra
                   first row, preceded by the marker item "D".
    listOfLists -- list of rows; every item goes through
                   convertNumberedEntities() and convertNamedEntities().

    Layout of the result: a header line with the row count (data rows + 1),
    one header line per row listing the length of each of its items, then a
    newline followed by all items separated by single spaces and one
    trailing space.
    """
    assert isinstance(listOfLists, list)
    headerLines = ["%d" % (len(listOfLists) + 1)]
    definitionTxt = definition.serialize()
    # Header line for the definition row: length of "D", then the length of
    # the serialized definition.
    headerLines.append("1 " + ("%d" % len(definitionTxt)))
    payload = ["D", definitionTxt]
    for row in listOfLists:
        lengths = []
        for field in row:
            field = convertNamedEntities(convertNumberedEntities(field))
            payload.append(field)
            # Lengths are measured after entity replacement.
            lengths.append("%d" % len(field))
        headerLines.append(" ".join(lengths))
    return "%s\n%s " % ("\n".join(headerLines), " ".join(payload))
Exemple #8
0
def universalDataFormatReplaceEntities(listOfLists):
    """Serialize rows of strings into one text blob, replacing entities.

    listOfLists -- list of rows; every item goes through
                   convertNumberedEntities() and convertNamedEntities().

    Layout of the result: a header line with the number of rows, one header
    line per row listing the length of each of its items, then a newline
    followed by all items separated by single spaces and one trailing space.
    """
    assert isinstance(listOfLists, list)
    headerLines = ["%d" % len(listOfLists)]
    payload = []
    for row in listOfLists:
        lengths = []
        for field in row:
            field = convertNamedEntities(convertNumberedEntities(field))
            payload.append(field)
            # Lengths are measured after entity replacement.
            lengths.append("%d" % len(field))
        headerLines.append(" ".join(lengths))
    return "%s\n%s " % ("\n".join(headerLines), " ".join(payload))
Exemple #9
0
def universalDataFormatWithDefinition(definition, listOfLists):
    """Serialize a definition plus rows of strings into one text blob.

    definition  -- object whose serialize() result is emitted as an extra
                   first row, preceded by the marker item "D".
    listOfLists -- list of rows; every item goes through
                   convertNumberedEntities() and convertNamedEntities().

    Layout of the result: a header line with the row count (data rows + 1),
    one header line per row listing the length of each of its items, then a
    newline followed by all items separated by single spaces and one
    trailing space.
    """
    assert isinstance(listOfLists, list)
    headerLines = ["%d" % (len(listOfLists) + 1)]
    definitionTxt = definition.serialize()
    # Header line for the definition row: length of "D", then the length of
    # the serialized definition.
    headerLines.append("1 " + ("%d" % len(definitionTxt)))
    payload = ["D", definitionTxt]
    for row in listOfLists:
        lengths = []
        for field in row:
            field = convertNamedEntities(convertNumberedEntities(field))
            payload.append(field)
            # Lengths are measured after entity replacement.
            lengths.append("%d" % len(field))
        headerLines.append(" ".join(lengths))
    return "%s\n%s " % ("\n".join(headerLines), " ".join(payload))
Exemple #10
0
def universalDataFormatReplaceEntities(listOfLists):
    """Serialize rows of strings into one text blob, replacing entities.

    listOfLists -- list of rows; every item goes through
                   convertNumberedEntities() and convertNamedEntities().

    Layout of the result: a header line with the number of rows, one header
    line per row listing the length of each of its items, then a newline
    followed by all items separated by single spaces and one trailing space.
    """
    assert isinstance(listOfLists, list)
    headerLines = ["%d" % len(listOfLists)]
    payload = []
    for row in listOfLists:
        lengths = []
        for field in row:
            field = convertNamedEntities(convertNumberedEntities(field))
            payload.append(field)
            # Lengths are measured after entity replacement.
            lengths.append("%d" % len(field))
        headerLines.append(" ".join(lengths))
    return "%s\n%s " % ("\n".join(headerLines), " ".join(payload))
Exemple #11
0
def convertArticle(term, text):
    try:
        text = text.replace('__NOTOC__', '')
        text = fixSup2(text)
        text = removeImageRx(text)
        # remove categories. TODO: provide a better support for categories
        # i.e. we remember categories on the server and client can display
        # all articles in a given category
        #        text=replaceRegExp(text, categoryRe, '')
        text = replaceWikiMacros(text)
        # remove remaining templates. TODO: better support for templates
        # in wikipedia template text is replaced by a page from Template:
        # namespace
        text = replaceRegExp(text, wikiTemplateRe, '')
        text = text.replace('\r', '')
        text = replaceRegExp(
            text, commentRe, ''
        )  # This should be safe, as it's illegal in html to nest comments

        text = stripTagBlocks(text, 'div')
        text = stripTagBlocks(text, 'table')
        text = stripBlocks(text, r'\{\|', r'\|\}')

        text = replaceRegExp(text, scriptRe, '')

        text = replaceTagList(text, ['b', 'strong'], "'''")
        text = replaceTagList(text, ['em', 'i', 'cite'], "''")
        text = replaceTagList(text, ['hr'], '----')
        text = replaceTagList(text, ['p'], '<br>')
        text = replaceTagList(text, [
            'dfn', 'code', 'samp', 'kbd', 'var', 'abbr', 'acronym',
            'blockquote', 'q', 'pre', 'ins', 'del', 'dir', 'menu', 'img',
            'object', 'big', 'span', 'applet', 'font', 'basefont', 'tr', 'td',
            'table', 'center', 'div'
        ], '')
        text = replaceRegExp(text, badLinkRe, '', supportedLanguagesRe())
        text = entities.convertNamedEntities(term, text)
        text = entities.convertNumberedEntities(term, text)
        text = stripMultipleNewLines(text)
        text = text.strip()
        text += '\n'
        return text
    except Exception, ex:
        print "Exception while converting term: ", term
        print arsutils.exceptionAsStr(ex)
        return ''
Exemple #12
0
def convertArticle(term, text):
    try:
        text = text.replace("__NOTOC__", "")
        text = fixSup2(text)
        text = removeImageRx(text)
        # remove categories. TODO: provide a better support for categories
        # i.e. we remember categories on the server and client can display
        # all articles in a given category
        #        text=replaceRegExp(text, categoryRe, '')
        text = replaceWikiMacros(text)
        # remove remaining templates. TODO: better support for templates
        # in wikipedia template text is replaced by a page from Template:
        # namespace
        text = replaceRegExp(text, wikiTemplateRe, "")
        text = text.replace("\r", "")
        text = replaceRegExp(text, commentRe, "")  # This should be safe, as it's illegal in html to nest comments

        text = stripTagBlocks(text, "div")
        text = stripTagBlocks(text, "table")
        text = stripBlocks(text, r"\{\|", r"\|\}")

        text = replaceRegExp(text, scriptRe, "")

        text = replaceTagList(text, ["b", "strong"], "'''")
        text = replaceTagList(text, ["em", "i", "cite"], "''")
        text = replaceTagList(text, ["hr"], "----")
        text = replaceTagList(text, ["p"], "<br>")
        text = replaceTagList(
            text,
            [
                "dfn",
                "code",
                "samp",
                "kbd",
                "var",
                "abbr",
                "acronym",
                "blockquote",
                "q",
                "pre",
                "ins",
                "del",
                "dir",
                "menu",
                "img",
                "object",
                "big",
                "span",
                "applet",
                "font",
                "basefont",
                "tr",
                "td",
                "table",
                "center",
                "div",
            ],
            "",
        )
        text = replaceRegExp(text, badLinkRe, "", supportedLanguagesRe())
        text = entities.convertNamedEntities(term, text)
        text = entities.convertNumberedEntities(term, text)
        text = stripMultipleNewLines(text)
        text = text.strip()
        text += "\n"
        return text
    except Exception, ex:
        print "Exception while converting term: ", term
        print arsutils.exceptionAsStr(ex)
        return ""