コード例 #1
0
def parse411ReversePhoneLookup(htmlTxt):
    """Pull one name/address/city/phone record out of a lookup result page.

    Returns a ``(status, payload)`` tuple:

    * ``(NO_RESULTS, None)`` when the page contains the text
      "We did not find a listing";
    * ``(UNKNOWN_FORMAT, None)`` when the ``<div id="subtext">`` element,
      or the name string inside its ``span``/``strong``, is missing;
    * ``(RESULTS_DATA, data)`` otherwise, where ``data`` is what
      universalDataFormatReplaceEntities returns for a single
      ``[name, address, city, phone]`` row.

    A short diagnostic line is printed on every failure path.
    """
    if htmlTxt.find("We did not find a listing") != -1:
        print("parse411ReversePhoneLookup: no results")
        return (NO_RESULTS, None)

    subtextDiv = BeautifulSoup21.BeautifulSoup(htmlTxt).first(
        "div", {"id" : "subtext"})
    if not subtextDiv:
        print("parse411ReversePhoneLookup: no div")
        return (UNKNOWN_FORMAT, None)

    rawName = subtextDiv.span.strong.string
    if not rawName:
        print("parse411ReversePhoneLookup: no name")
        return (UNKNOWN_FORMAT, None)
    name = cleanupName(convertNamedEntities(convertNumberedEntities(rawName)))

    # The remaining fields are located purely by position: the address node
    # sits six ``.next`` hops after the div, then city and phone follow at
    # two-hop intervals.
    addressNode = subtextDiv
    for _hop in range(6):
        addressNode = addressNode.next
    address = addressNode.string

    cityNode = addressNode.next.next
    city = cityNode.string

    phoneNode = cityNode.next.next
    phone = phoneNode.string

    rows = [[name, address, city, phone]]
    return (RESULTS_DATA, universalDataFormatReplaceEntities(rows))
コード例 #2
0
ファイル: m411_by411.py プロジェクト: kjk/moriarty-palm
def parse411ReversePhoneLookup(htmlTxt):
    """Parse a lookup result page into a single four-field record.

    The result is a ``(status, payload)`` pair. ``payload`` is ``None``
    unless parsing succeeded:

    * ``NO_RESULTS`` - the page text contains "We did not find a listing";
    * ``UNKNOWN_FORMAT`` - no ``<div id="subtext">`` was found, or the
      name string under its ``span``/``strong`` is empty;
    * ``RESULTS_DATA`` - ``payload`` is the output of
      universalDataFormatReplaceEntities for one
      ``[name, address, city, phone]`` row.

    Each failure path prints a one-line diagnostic first.
    """
    noListing = htmlTxt.find("We did not find a listing") != -1
    if noListing:
        print("parse411ReversePhoneLookup: no results")
        return (NO_RESULTS, None)

    parsedPage = BeautifulSoup21.BeautifulSoup(htmlTxt)
    container = parsedPage.first("div", {"id": "subtext"})
    if not container:
        print("parse411ReversePhoneLookup: no div")
        return (UNKNOWN_FORMAT, None)

    name = container.span.strong.string
    if not name:
        print("parse411ReversePhoneLookup: no name")
        return (UNKNOWN_FORMAT, None)
    for cleanStep in (convertNumberedEntities, convertNamedEntities, cleanupName):
        name = cleanStep(name)

    # Address, city and phone are found by walking the parser's ``.next``
    # links: six hops from the div to the address, then two hops to each
    # following field.
    cursor = container.next.next.next.next.next.next
    record = [name, cursor.string]

    cursor = cursor.next.next
    record.append(cursor.string)

    cursor = cursor.next.next
    record.append(cursor.string)

    return (RESULTS_DATA, universalDataFormatReplaceEntities([record]))
コード例 #3
0
def universalDataFormatWithDefinition(definition, listOfLists):
    """Serialize a definition plus rows of strings into one text blob.

    The output is built as follows:

    * line 1: the row count, counting the definition as an extra row
      (``len(listOfLists) + 1``);
    * line 2: ``"1 <n>"`` - the lengths of the ``"D"`` marker and of the
      serialized definition;
    * one line per row of ``listOfLists`` holding the space-separated
      lengths of that row's items;
    * a last line with ``"D"``, the serialized definition and every item,
      joined by single spaces and followed by one trailing space.

    Every item has its numbered and named entities converted before its
    length is measured and before it is emitted.

    Args:
        definition: object providing ``serialize()``; the result is joined
            with the other strings, so it must be a string.
        listOfLists: list of rows, each row an iterable of strings.

    Returns:
        The serialized string.

    Raises:
        AssertionError: if ``listOfLists`` is not a list (only while
            assertions are enabled).
    """
    assert isinstance(listOfLists, list)
    header = ["%d" % (len(listOfLists) + 1)]
    serializedDefinition = definition.serialize()
    header.append("1 %d" % len(serializedDefinition))
    results = ["D", serializedDefinition]
    for smallList in listOfLists:
        itemLengths = []
        for item in smallList:
            item = convertNamedEntities(convertNumberedEntities(item))
            results.append(item)
            itemLengths.append("%d" % len(item))
        # str.join replaces the deprecated string.join() module function;
        # the produced text is identical.
        header.append(" ".join(itemLengths))
    return "%s\n%s " % ("\n".join(header), " ".join(results))
コード例 #4
0
def universalDataFormatReplaceEntities(listOfLists):
    """Serialize rows of strings into one text blob, converting entities.

    The output is built as follows:

    * line 1: the number of rows (``len(listOfLists)``);
    * one line per row holding the space-separated lengths of that row's
      items;
    * a last line with every item of every row joined by single spaces
      and followed by one trailing space.

    Every item has its numbered and named entities converted before its
    length is measured and before it is emitted.

    Args:
        listOfLists: list of rows, each row an iterable of strings.

    Returns:
        The serialized string.

    Raises:
        AssertionError: if ``listOfLists`` is not a list (only while
            assertions are enabled).
    """
    assert isinstance(listOfLists, list)
    header = ["%d" % len(listOfLists)]
    results = []
    for smallList in listOfLists:
        itemLengths = []
        for item in smallList:
            # lengths must describe the text actually emitted, so the
            # entities are converted before measuring
            item = convertNamedEntities(convertNumberedEntities(item))
            results.append(item)
            itemLengths.append("%d" % len(item))
        # str.join replaces the deprecated string.join() module function;
        # the produced text is identical.
        header.append(" ".join(itemLengths))
    return "%s\n%s " % ("\n".join(header), " ".join(results))
コード例 #5
0
ファイル: parserUtils.py プロジェクト: kjk/moriarty-palm
def universalDataFormatWithDefinition(definition, listOfLists):
    """Build the length-prefixed text form of a definition and data rows.

    Output structure, in order:

    * a line with ``len(listOfLists) + 1`` (the definition counts as one
      extra row);
    * a line ``"1 <n>"`` giving the lengths of the ``"D"`` marker and of
      ``definition.serialize()``;
    * for each row, a line of space-separated item lengths;
    * finally ``"D"``, the serialized definition and all items joined by
      single spaces, with one trailing space.

    Numbered and named entities in each item are converted first, so the
    recorded lengths match the emitted text.

    Args:
        definition: object providing ``serialize()``; its result is joined
            with the other strings, so it must be a string.
        listOfLists: list of rows, each row an iterable of strings.

    Returns:
        The serialized string.

    Raises:
        AssertionError: if ``listOfLists`` is not a list (only while
            assertions are enabled).
    """
    assert isinstance(listOfLists, list)
    header = ["%d" % (len(listOfLists) + 1)]
    serializedDefinition = definition.serialize()
    header.append("1 %d" % len(serializedDefinition))
    results = ["D", serializedDefinition]
    for smallList in listOfLists:
        headerItem = []
        for item in smallList:
            item = convertNumberedEntities(item)
            item = convertNamedEntities(item)
            results.append(item)
            headerItem.append("%d" % len(item))
        # use the str.join method instead of the deprecated string.join()
        # module function; output is unchanged
        header.append(" ".join(headerItem))
    headerText = "\n".join(header)
    bodyText = " ".join(results)
    return "%s\n%s " % (headerText, bodyText)
コード例 #6
0
ファイル: parserUtils.py プロジェクト: kjk/moriarty-palm
def universalDataFormatReplaceEntities(listOfLists):
    """Build the length-prefixed text form of data rows, converting entities.

    Output structure, in order:

    * a line with ``len(listOfLists)``;
    * for each row, a line of space-separated item lengths;
    * finally all items of all rows joined by single spaces, with one
      trailing space.

    Numbered and named entities in each item are converted first, so the
    recorded lengths match the emitted text.

    Args:
        listOfLists: list of rows, each row an iterable of strings.

    Returns:
        The serialized string.

    Raises:
        AssertionError: if ``listOfLists`` is not a list (only while
            assertions are enabled).
    """
    assert isinstance(listOfLists, list)
    header = ["%d" % len(listOfLists)]
    results = []
    for smallList in listOfLists:
        headerItem = []
        for item in smallList:
            # remove entities
            item = convertNumberedEntities(item)
            item = convertNamedEntities(item)
            # record the converted text and its length
            results.append(item)
            headerItem.append("%d" % len(item))
        # use the str.join method instead of the deprecated string.join()
        # module function; output is unchanged
        header.append(" ".join(headerItem))
    headerText = "\n".join(header)
    bodyText = " ".join(results)
    return "%s\n%s " % (headerText, bodyText)
コード例 #7
0
def convertArticle(term, text):
    """Clean up the raw wiki text of one article.

    ``text`` is passed through a fixed sequence of clean-up steps; each
    step consumes the output of the previous one, so the order of the
    statements below matters.

    Args:
        term: identifies the article; it is handed to the entity
            converters and printed if the conversion fails.
        text: the raw article text.

    Returns:
        The converted text with leading/trailing whitespace stripped and
        exactly one newline appended. If any step raises an Exception the
        error is printed (not propagated) and an empty string is returned.
    """
    try:
        # drop the MediaWiki __NOTOC__ marker
        text = text.replace('__NOTOC__', '')
        text = fixSup2(text)
        text = removeImageRx(text)
        # remove categories. TODO: provide a better support for categories
        # i.e. we remember categories on the server and client can display
        # all articles in a given category
        # (category removal is currently disabled:)
        #        text=replaceRegExp(text, categoryRe, '')
        text = replaceWikiMacros(text)
        # remove remaining templates. TODO: better support for templates
        # in wikipedia template text is replaced by a page from Template:
        # namespace
        text = replaceRegExp(text, wikiTemplateRe, '')
        # drop carriage returns
        text = text.replace('\r', '')
        text = replaceRegExp(
            text, commentRe, ''
        )  # This should be safe, as it's illegal in html to nest comments

        # presumably removes whole <div>/<table> elements including their
        # content - confirm in stripTagBlocks
        text = stripTagBlocks(text, 'div')
        text = stripTagBlocks(text, 'table')
        # the delimiters are regex patterns for "{|" and "|}"; presumably
        # wiki-markup tables - confirm in stripBlocks
        text = stripBlocks(text, r'\{\|', r'\|\}')

        text = replaceRegExp(text, scriptRe, '')

        # presumably maps the listed HTML tags to the given replacement text
        # (wiki bold/italic quotes, a rule, a line break, or nothing) -
        # confirm in replaceTagList
        text = replaceTagList(text, ['b', 'strong'], "'''")
        text = replaceTagList(text, ['em', 'i', 'cite'], "''")
        text = replaceTagList(text, ['hr'], '----')
        text = replaceTagList(text, ['p'], '<br>')
        text = replaceTagList(text, [
            'dfn', 'code', 'samp', 'kbd', 'var', 'abbr', 'acronym',
            'blockquote', 'q', 'pre', 'ins', 'del', 'dir', 'menu', 'img',
            'object', 'big', 'span', 'applet', 'font', 'basefont', 'tr', 'td',
            'table', 'center', 'div'
        ], '')
        # NOTE(review): the meaning of the 4th argument is not visible from
        # here - check replaceRegExp
        text = replaceRegExp(text, badLinkRe, '', supportedLanguagesRe())
        # named entities are converted before numbered ones
        text = entities.convertNamedEntities(term, text)
        text = entities.convertNumberedEntities(term, text)
        text = stripMultipleNewLines(text)
        # normalise the ends: no surrounding whitespace, one final newline
        text = text.strip()
        text += '\n'
        return text
    except Exception, ex:
        # best effort: report the failing term and return an empty article
        print "Exception while converting term: ", term
        print arsutils.exceptionAsStr(ex)
        return ''
コード例 #8
0
ファイル: articleconvert.py プロジェクト: kjk/ipedia-palm
def convertArticle(term, text):
    """Clean up the raw wiki text of one article.

    ``text`` is passed through a fixed sequence of clean-up steps; each
    step consumes the output of the previous one, so the order of the
    statements below matters.

    Args:
        term: identifies the article; it is handed to the entity
            converters and printed if the conversion fails.
        text: the raw article text.

    Returns:
        The converted text with leading/trailing whitespace stripped and
        exactly one newline appended. If any step raises an Exception the
        error is printed (not propagated) and an empty string is returned.
    """
    try:
        # drop the MediaWiki __NOTOC__ marker
        text = text.replace("__NOTOC__", "")
        text = fixSup2(text)
        text = removeImageRx(text)
        # remove categories. TODO: provide a better support for categories
        # i.e. we remember categories on the server and client can display
        # all articles in a given category
        # (category removal is currently disabled:)
        #        text=replaceRegExp(text, categoryRe, '')
        text = replaceWikiMacros(text)
        # remove remaining templates. TODO: better support for templates
        # in wikipedia template text is replaced by a page from Template:
        # namespace
        text = replaceRegExp(text, wikiTemplateRe, "")
        # drop carriage returns
        text = text.replace("\r", "")
        text = replaceRegExp(text, commentRe, "")  # This should be safe, as it's illegal in html to nest comments

        # presumably removes whole <div>/<table> elements including their
        # content - confirm in stripTagBlocks
        text = stripTagBlocks(text, "div")
        text = stripTagBlocks(text, "table")
        # the delimiters are regex patterns for "{|" and "|}"; presumably
        # wiki-markup tables - confirm in stripBlocks
        text = stripBlocks(text, r"\{\|", r"\|\}")

        text = replaceRegExp(text, scriptRe, "")

        # presumably maps the listed HTML tags to the given replacement text
        # (wiki bold/italic quotes, a rule, a line break, or nothing) -
        # confirm in replaceTagList
        text = replaceTagList(text, ["b", "strong"], "'''")
        text = replaceTagList(text, ["em", "i", "cite"], "''")
        text = replaceTagList(text, ["hr"], "----")
        text = replaceTagList(text, ["p"], "<br>")
        text = replaceTagList(
            text,
            [
                "dfn",
                "code",
                "samp",
                "kbd",
                "var",
                "abbr",
                "acronym",
                "blockquote",
                "q",
                "pre",
                "ins",
                "del",
                "dir",
                "menu",
                "img",
                "object",
                "big",
                "span",
                "applet",
                "font",
                "basefont",
                "tr",
                "td",
                "table",
                "center",
                "div",
            ],
            "",
        )
        # NOTE(review): the meaning of the 4th argument is not visible from
        # here - check replaceRegExp
        text = replaceRegExp(text, badLinkRe, "", supportedLanguagesRe())
        # named entities are converted before numbered ones
        text = entities.convertNamedEntities(term, text)
        text = entities.convertNumberedEntities(term, text)
        text = stripMultipleNewLines(text)
        # normalise the ends: no surrounding whitespace, one final newline
        text = text.strip()
        text += "\n"
        return text
    except Exception, ex:
        # best effort: report the failing term and return an empty article
        print "Exception while converting term: ", term
        print arsutils.exceptionAsStr(ex)
        return ""