Beispiel #1
0
 def check_headline(headline):
   """Classify a headline as a full 'sentence' or a 'topic' phrase.

   The headline is HTML-unescaped, lower-cased, tokenized, POS-tagged
   and chunked with the module-level NLPParser models, then classified
   by what the parse tree contains.

   :param headline: raw headline text (may contain HTML entities)
   :return: (text, kind) tuple where kind is 'sentence' or 'topic'
   """
   headline = HTMLParser().unescape(headline)
   headline = headline.lower()
   tokens = nltk.word_tokenize(headline)
   tagged = NLPParser.t3.tag(tokens)
   entities = nltk.ne_chunk(tagged, binary=True)
   parsed = NLPParser.cp.parse(entities)

   def join_tokens(leaves):
     # Glue the token halves of (token, tag) leaf pairs back together.
     return ' '.join(tok for tok, _ in leaves)

   # 1) A clause-level subtree (SBAR) means the headline reads as a sentence.
   for node in parsed:
     if isinstance(node, nltk.Tree) and node.label() == 'SBAR':
       return join_tokens(node.leaves()), 'sentence'
   tag_sequence = [tag for _, tag in tagged]
   # 2) A named-entity chunk is taken as the topic.
   for node in parsed:
     if isinstance(node, nltk.Tree) and node.label() == 'NE':
       return join_tokens(node.leaves()), 'topic'
   # 3) Otherwise fall back to the longest noun phrase.
   np_len = 0
   np_topic = []
   for node in parsed:
     if isinstance(node, nltk.Tree) and node.label() == 'NP':
       np = node.leaves()
       # NOTE(review): np holds (token, tag) tuples, so the string 'PP'
       # can never be a member of it; the check probably meant to look
       # at the tags. Preserved as-is pending confirmation.
       if len(np) >= np_len and 'PP' not in np:
         np_len = len(np)
         np_topic = np
   if np_topic:
     return join_tokens(np_topic), 'topic'
   # 4) No usable NP: any verb tag makes it a sentence, else a topic.
   for tag in tag_sequence:
     if re.match(r'VB.*', tag):
       return NLPParser.simple_headline(headline), 'sentence'
   return NLPParser.simple_headline(headline), 'topic'
Beispiel #2
0
def uncode_name(name):
    """Decode &#-entity codes, strip CDATA wrappers and lower-case *name*."""
    from HTMLParser import HTMLParser
    stripped = name.replace('<![CDATA[', '').replace(']]', '')
    return HTMLParser().unescape(stripped.lower())
Beispiel #3
0
def cleanUpMatch(match):
    """Normalize a raw text match: strip markup, translate special
    characters, decode HTML entities and lower-case the result."""
    # (pattern, replacement, flags) regex passes, applied in order.
    passes = (
        # remove everything between double line breaks
        (r'\n\s*\n.*?\n\s*\n', '', re.DOTALL),
        # remove soft hyphen
        (r'&#xAD;(\n)*(<lb/>)*', '', 0),
        # remove box character
        (r'\xc2\xad(\n)*(<lb/>)*', '', 0),
        # replace <lb/> or '\n' with ' '
        (r'<lb/>|\n', ' ', 0),
        # remove everything between '<>'
        (r'<.*?>', '', 0),
    )
    for pattern, replacement, flags in passes:
        match = re.sub(pattern, replacement, match, flags=flags)

    # translate special characters
    for char in TRANSLATIONS:
        match = match.replace(char, TRANSLATIONS[char])

    # collapse runs of whitespace into single spaces
    match = ' '.join(match.split())

    # decode html entities, then normalize to lower case
    return HTMLParser().unescape(match).lower()
    def un_code_name(name):
        """
        Convert all the &# codes to char, remove extra-space and normalize
        :param name: string to convert
        :type name: str
        :return: converted string
        """
        from HTMLParser import HTMLParser

        cleaned = name.replace('<![CDATA[', '').replace(']]', '')
        cleaned = cleaned.lower()
        return HTMLParser().unescape(cleaned)
Beispiel #5
0
def normalize_string(string, charset=None, replacing=False):
    """
    Decode and convert any string to unicode, then normalize it.

    Byte strings are decoded by trying, in order: quoted-printable,
    JSON, a literal eval, latin-1, and finally a lossy unicode cast.
    The result is cleaned of control chars, CDATA wrappers and HTML
    entities, and lower-cased.

    :param charset: encoding used when decoding byte strings
    :type charset: str
    :param string: string to convert
    :type string: str or unicode
    :param replacing: whether apostrophes (') are removed from the result
    :type replacing: bool
    :return: converted unicode
    :rtype: unicode
    """
    # NOTE(review): Python 2 only — relies on the `unicode` builtin.
    if not isinstance(string, unicode):
        try:
            # quoted-printable payloads contain '=XX' hex escapes
            if re.search(u'=[0-9a-fA-F]{2}', string):
                string = string.decode('Quoted-printable')

            string = json.loads(u'%s' % string, encoding=charset)

        except ValueError:
            try:
                # HACK(review): eval on the input is a code-execution
                # hazard if *string* can come from an untrusted source.
                string = unicode(eval(string), 'raw_unicode_escape')

            except (SyntaxError, NameError):
                # not a Python literal — assume latin-1 bytes
                string = string.decode('latin-1')
                pass

            except TypeError:
                string = unicode(string, errors='ignore')
                pass

        except LookupError:
            # unknown charset name: give up and return empty unicode
            return u''

        except TypeError:
            string = unicode(string, errors='ignore')
            pass

    string = remove_control_chars(string)
    string = fix_bad_unicode(string)
    string = unquote(string)
    string = string.replace(u'<![CDATA[', u'').replace(u']]', u'')
    string = HTMLParser().unescape(string)
    if replacing:
        string = string.replace(u"'", '')

    string = string.lower()

    return string
Beispiel #6
0
def uncodeName(name):  # Convert all the &# codes to char, remove extra-space and normalize
    from HTMLParser import HTMLParser
    # Drop the CDATA wrapper, lower-case, then decode HTML entities.
    without_cdata = name.replace('<![CDATA[', '').replace(']]', '')
    return HTMLParser().unescape(without_cdata.lower())
Beispiel #7
0
    def uncode_name(name):
        # Convert all the &# entity codes to characters, strip the CDATA
        # wrapper and normalize to lower case.
        from HTMLParser import HTMLParser

        for marker in ("<![CDATA[", "]]"):
            name = name.replace(marker, "")
        return HTMLParser().unescape(name.lower())
Beispiel #8
0
def populate_restaurants(c):
    print 'Populating Restaurants table...'

    if not (os.access('restaurants', os.R_OK)
            and os.path.isdir('restaurants')):
        print >> sys.stderr, "Error: cannot access raw data directory 'restaurants'"
        sys.exit(1)

    if not (os.access('suburbs.txt', os.R_OK)
            and os.path.isfile('suburbs.txt')):
        print >> sys.stderr, "Error: cannot access raw data file 'suburbs.txt'"
        sys.exit(1)

    #get postcodes from file and cache in dict
    suburbs = open('suburbs.txt').readlines()
    postcodes = {}
    for suburb in suburbs:
        lat, lng, pst, sub = suburb.strip().split('\t')
        postcodes[sub] = pst
    postcodes['CBD'] = 2000  #special case not in data file

    users = c.execute('SELECT username FROM Users').fetchall()
    num_users = c.execute('SELECT COUNT(*) FROM Users').fetchone()[0]

    i = 0
    for restaurant in glob.glob('restaurants/*'):
        r = open(restaurant).readlines()

        #extract info from file
        try:
            name = r[0].strip()
            name = HTMLParser().unescape(name)
            address = r[1].strip()
            address = HTMLParser().unescape(address)
            address = re.sub(r'nsw', 'NSW', address, flags=re.I)
            if not address.endswith(', NSW'):
                address = address + ', NSW'
            suburb = re.match(r'.*, (.+), Sydney', r[1]).group(1)
            suburb = HTMLParser().unescape(suburb)
            phone = r[2].strip().replace('(', '').replace(')', '')
            if re.match('Not available', phone):
                phone = 'Not provided'
            hours = r[3].strip()
            hours = re.sub(r'\s*,\s*', ', ', hours)
            hours = HTMLParser().unescape(hours)
            cuisine = r[4].strip()
            cuisine = HTMLParser().unescape(cuisine)
            cost = r[5].strip()
            image = r[6].strip()
        except:
            print >> sys.stderr, "Error: skipping '%s'" % restaurant
            continue

        #lookup postcode using suburb
        postcode = ''
        if not suburb in postcodes:
            continue
        else:
            postcode = postcodes[suburb]

        #and append it to the address
        address = address + ' ' + str(postcode)

        #chose a random protocol for the website
        protocol = 'http://'
        if random.randint(0, 1) == 1:
            protocol = 'https://'

        #make site of the form protocol://www.lowercase.name.of.restaurant.fake.com
        website = name.replace('  ', ' ').replace(' ', '.').replace(
            '-', '').strip() + '.fake.com'
        website = HTMLParser().unescape(website)
        website = urllib.quote(website)  #encode as url
        website = protocol + 'www.' + website  #avoid encoding the protocol
        website = website.lower().replace('..', '.')

        #ensure only some restaurants have owners
        owner = None
        if random.randint(0, 3) == 0:
            owner = users[random.randint(0, num_users - 1)][0]

        i += 1
        data = (i, name, suburb, address, postcode, phone, hours, cuisine,
                owner, website, cost, image)
        c.execute(
            '''INSERT INTO Restaurants
				(id, name, suburb, address, postcode, phone, hours, cuisine, owner, website, cost, image)
				VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)''', data)
Beispiel #9
0
# Sample random VK club pages, collect member counts and the words used
# in club titles, and append the ten most common title words to vk.txt.
# NOTE(review): Python 2 script; relies on module-level state defined
# elsewhere (quantity_of_clubs, visited_clubs, quantity_of_members,
# words_in_club_titles, words_to_exclude, rx1..rx3, urlopen, randrange).
f = codecs.open('vk.txt', mode = 'a', encoding= 'utf-8')
f.write('--------------------\n')
f.write(u'количество клубов - ' + str(quantity_of_clubs) + u'\n')

i = 1
while i <= quantity_of_clubs:
    # random club id; retry until we hit one we have not visited yet
    club_number = randrange(1000000, 30000000)
    if club_number not in visited_clubs:
        visited_clubs.add(club_number)
        data = urlopen('http://vk.com/club%s' % club_number).read().decode('utf8')
        title = HTMLParser().unescape(re.findall('<title>(.*)</title>', data)[0])
        # skip error pages and private groups (titles 'Ошибка' / 'Частная группа')
        if title not in ([u'Ошибка', u'Частная группа']):
            quantity = HTMLParser().unescape(re.findall(u'Участники\s+<em class="pm_counter">(\d+)</em>',data))
            if quantity:
                quantity_of_members.append(int(quantity[0]))
                words = title.lower().split(' ')
                for word in words:
                    # drop single-character words and words from the exclusion list
                    if len(word) != 1 and word not in words_to_exclude:
                        new_word = rx3.sub(u'-', (rx2.sub(u'', rx1.sub(u'',word)))) # strip everything except letters and '-'
                        # drop empty words, single-character words and excluded words
                        if new_word != '' and len(new_word) != 1 and new_word not in words_to_exclude:
                            words_in_club_titles.append(new_word)
                i +=1


# report the ten most frequent title words to stdout and the log file
counts = Counter(words_in_club_titles)
top = counts.most_common(10)
for element in top:
    print "'%s', %d " %(element[0], element[1])
    f.write("'%s', %d " %(element[0], element[1]))
Beispiel #10
0
# Reformatted variant of the VK club sampler: visit random club pages,
# record member counts and cleaned title words for later frequency stats.
# NOTE(review): Python 2 script; relies on module-level state defined
# elsewhere (quantity_of_clubs, visited_clubs, quantity_of_members,
# words_in_club_titles, words_to_exclude, rx1..rx3, urlopen, randrange).
i = 1
while i <= quantity_of_clubs:
    # random club id; retry until we hit one we have not visited yet
    club_number = randrange(1000000, 30000000)
    if club_number not in visited_clubs:
        visited_clubs.add(club_number)
        data = urlopen('http://vk.com/club%s' %
                       club_number).read().decode('utf8')
        title = HTMLParser().unescape(
            re.findall('<title>(.*)</title>', data)[0])
        # skip error pages and private groups (titles 'Ошибка' / 'Частная группа')
        if title not in ([u'Ошибка', u'Частная группа']):
            quantity = HTMLParser().unescape(
                re.findall(u'Участники\s+<em class="pm_counter">(\d+)</em>',
                           data))
            if quantity:
                quantity_of_members.append(int(quantity[0]))
                words = title.lower().split(' ')
                for word in words:
                    # drop single-character words and words from the exclusion list
                    if len(word) != 1 and word not in words_to_exclude:
                        new_word = rx3.sub(
                            u'-', (rx2.sub(u'', rx1.sub(
                                u'', word))))  # strip everything except letters and '-'
                        # drop empty words, single-character words and excluded words
                        if new_word != '' and len(
                                new_word
                        ) != 1 and new_word not in words_to_exclude:
                            words_in_club_titles.append(new_word)
                i += 1

# tally word frequencies across all sampled club titles
counts = Counter(words_in_club_titles)
top = counts.most_common(10)