Esempio n. 1
 def check_headline(headline):
   """Extract the key phrase of a headline and classify it.

   The headline is HTML-unescaped, lower-cased, tokenized, POS-tagged with
   ``NLPParser.t3``, named-entity chunked and finally parsed with
   ``NLPParser.cp``.  The top-level nodes of the parse are then examined in
   priority order:

   1. the first ``SBAR`` subtree  -> ``(text, 'sentence')``
   2. the first ``NE`` subtree    -> ``(text, 'topic')``
   3. the longest ``NP`` subtree (the last one wins on ties)
                                  -> ``(text, 'topic')``
   4. otherwise ``NLPParser.simple_headline(headline)``, labelled
      ``'sentence'`` when any POS tag starts with ``VB`` and ``'topic'``
      when none does.

   :param headline: raw headline text, possibly containing HTML entities
   :return: a ``(text, kind)`` tuple where kind is 'sentence' or 'topic'
   """
   headline = HTMLParser().unescape(headline)
   headline = headline.lower()
   tokens = nltk.word_tokenize(headline)
   tagged = NLPParser.t3.tag(tokens)
   entities = nltk.ne_chunk(tagged, binary=True)
   parsed = NLPParser.cp.parse(entities)

   # 1. A subordinate clause is returned verbatim as a sentence.
   for node in parsed:
     if isinstance(node, nltk.Tree) and node.label() == 'SBAR':
       return ' '.join(t[0] for t in node.leaves()), 'sentence'

   # 2. A named entity is the preferred topic.
   for node in parsed:
     if isinstance(node, nltk.Tree) and node.label() == 'NE':
       return ' '.join(t[0] for t in node.leaves()), 'topic'

   # 3. Otherwise take the longest noun phrase; `>=` keeps the last of
   #    several equally long candidates.
   np_len = 0
   np_topic = []
   for node in parsed:
     if isinstance(node, nltk.Tree) and node.label() == 'NP':
       leaves = node.leaves()
       # NOTE(review): `leaves` holds the tree's leaf items (indexed with
       # t[0] below), so comparing against the bare string 'PP' probably
       # never matches -- confirm what this filter was meant to exclude.
       if len(leaves) >= np_len and 'PP' not in leaves:
         np_len = len(leaves)
         np_topic = leaves
   if np_topic:
     return ' '.join(t[0] for t in np_topic), 'topic'

   # 4. Fallback.  This used to be nested under the `if np_topic:` branch
   #    after its `return`, which made it unreachable and let the function
   #    return None whenever no NP was found.
   tag_sequence = [t[1] for t in tagged]
   for t in tag_sequence:
     if re.match(r'VB.*', t):
       return NLPParser.simple_headline(headline), 'sentence'
   return NLPParser.simple_headline(headline), 'topic'
Esempio n. 2
def uncode_name(name):
    """Return *name* lower-cased with CDATA markers removed and HTML decoded.

    The literal substrings ``'<![CDATA['`` and ``']]'`` are stripped first,
    then the text is lower-cased and its HTML character references
    (``&#233;``, ``&amp;`` ...) are converted to the characters they denote.

    :param name: text to normalize
    :return: the normalized text
    """
    # The signature previously had no `name` parameter, so the body raised
    # UnboundLocalError on every call.
    try:
        from html import unescape  # Python 3.4+
    except ImportError:
        # Python 2: only the legacy HTMLParser module is available.
        from HTMLParser import HTMLParser
        unescape = HTMLParser().unescape

    name = name.replace('<![CDATA[', '').replace(']]', '')
    return unescape(name.lower())
Esempio n. 3
def cleanUpMatch(match):
    """Normalize a raw text match into a single clean lower-case line.

    Steps, in order: regex clean-ups (see the table below), literal
    replacements from TRANSLATIONS, whitespace collapsing, HTML entity
    decoding, lower-casing.

    :param match: raw matched text, possibly containing markup
    :return: the cleaned, lower-cased text
    """
    # (pattern, replacement, flags) applied sequentially.
    substitutions = (
        # everything enclosed between two blank-line separators
        (r'\n\s*\n.*?\n\s*\n', '', re.DOTALL),
        # soft hyphen written as a character reference, plus any
        # line breaks / <lb/> tags that follow it
        (r'&#xAD;(\n)*(<lb/>)*', '', 0),
        # the \xc2\xad sequence (described upstream as a "box
        # character"), plus any line breaks / <lb/> tags that follow it
        (r'\xc2\xad(\n)*(<lb/>)*', '', 0),
        # remaining <lb/> tags and newlines become a single space
        (r'<lb/>|\n', ' ', 0),
        # any other tag is dropped
        (r'<.*?>', '', 0),
    )
    for pattern, replacement, flags in substitutions:
        match = re.sub(pattern, replacement, match, flags=flags)

    # Literal special-character translations.
    for special in TRANSLATIONS:
        match = match.replace(special, TRANSLATIONS[special])

    # Collapse runs of whitespace, then decode HTML entities.
    collapsed = ' '.join(match.split())
    decoded = HTMLParser().unescape(collapsed)

    # Output is always lower case.
    return decoded.lower()
Esempio n. 5
def normalize_string(string, charset=None, replacing=False):
    """
    Decode and Convert to Unicode any string

    Byte strings are decoded by trying, in order: quoted-printable (when an
    ``=XX`` escape is present) followed by ``json.loads``; on ValueError a
    ``raw_unicode_escape`` decode of the evaluated literal; then latin-1 or a
    lossy ``unicode()`` conversion.  The result is then cleaned with
    ``remove_control_chars``, ``fix_bad_unicode`` and ``unquote``, stripped of
    CDATA markers, HTML-unescaped and lower-cased.

    :param charset: encoding
    :type charset: str
    :param string: string to convert
    :type string: str or unicode
    :param replacing: Whether is ' is replaced
    :type replacing: bool
    :return: converted unicode (``u''`` when the charset is unknown)
    :rtype: unicode
    """
    # NOTE(review): the docstring quotes, both `try:` headers and the
    # `re.search(` call were missing from this block; they are restored here
    # from the surviving indentation and `except` clauses -- confirm against
    # the upstream source.
    if not isinstance(string, unicode):
        try:
            if re.search('=[0-9a-fA-F]{2}', string):
                string = string.decode('Quoted-printable')

            string = json.loads(u'%s' % string, encoding=charset)

        except ValueError:
            try:
                # SECURITY: eval() executes arbitrary expressions; this is
                # unsafe if `string` can come from an untrusted source.
                string = unicode(eval(string), 'raw_unicode_escape')

            except (SyntaxError, NameError):
                string = string.decode('latin-1')

            except TypeError:
                string = unicode(string, errors='ignore')

        except LookupError:
            # Unknown codec/charset: give up and return an empty string.
            return u''

        except TypeError:
            string = unicode(string, errors='ignore')

    string = remove_control_chars(string)
    string = fix_bad_unicode(string)
    string = unquote(string)
    string = string.replace(u'<![CDATA[', u'').replace(u']]', u'')
    string = HTMLParser().unescape(string)
    if replacing:
        string = string.replace(u"'", '')

    string = string.lower()

    return string
Esempio n. 6
Esempio n. 7
Esempio n. 8
def populate_restaurants(c):
    """Fill the Restaurants table from the raw files in 'restaurants/'.

    Each file under ``restaurants/`` is read as one restaurant: lines 0-6
    hold name, address, phone, hours, cuisine, cost and image.  The suburb is
    taken from the address line, its postcode looked up in ``suburbs.txt``
    (tab-separated, postcode in the third column and suburb in the fourth),
    and a website is synthesized from the name.  Roughly one restaurant in
    four is assigned a random owner from the Users table.

    Files that lack the expected lines or whose address does not match
    ``'..., <suburb>, Sydney'`` are reported on stderr and skipped.

    :param c: database cursor/connection whose ``execute`` accepts
        ``?``-style parameters
    :return: None
    """
    print('Populating Restaurants table...')

    if not (os.access('restaurants', os.R_OK)
            and os.path.isdir('restaurants')):
        sys.stderr.write(
            "Error: cannot access raw data directory 'restaurants'\n")
        # Nothing can be loaded without the data directory.
        return

    if not (os.access('suburbs.txt', os.R_OK)
            and os.path.isfile('suburbs.txt')):
        sys.stderr.write(
            "Error: cannot access raw data file 'suburbs.txt'\n")
        # Bail out instead of failing on the open() below.
        return

    # Get postcodes from file and cache them in a dict keyed by suburb.
    postcodes = {}
    with open('suburbs.txt') as suburbs:
        for line in suburbs:
            _lat, _lng, pst, sub = line.strip().split('\t')
            postcodes[sub] = pst
    postcodes['CBD'] = 2000  # special case not in data file

    users = c.execute('SELECT username FROM Users').fetchall()

    unescape = HTMLParser().unescape
    i = 0
    for restaurant in glob.glob('restaurants/*'):
        with open(restaurant) as raw:
            r = raw.readlines()

        # Extract info from file.
        # NOTE(review): the `try:` / `except` / `continue` lines were missing
        # from this block and are restored here; IndexError covers files with
        # too few lines, AttributeError an address that does not match.
        try:
            name = unescape(r[0].strip())
            address = unescape(r[1].strip())
            address = re.sub(r'nsw', 'NSW', address, flags=re.I)
            if not address.endswith(', NSW'):
                address = address + ', NSW'
            suburb = re.match(r'.*, (.+), Sydney', r[1]).group(1)
            suburb = unescape(suburb)
            phone = r[2].strip().replace('(', '').replace(')', '')
            if re.match('Not available', phone):
                phone = 'Not provided'
            hours = r[3].strip()
            hours = re.sub(r'\s*,\s*', ', ', hours)
            hours = unescape(hours)
            cuisine = unescape(r[4].strip())
            cost = r[5].strip()
            image = r[6].strip()
        except (IndexError, AttributeError):
            sys.stderr.write("Error: skipping '%s'\n" % restaurant)
            continue

        # Look up the postcode using the suburb; unknown suburbs get ''.
        # (The test was inverted before: it indexed the dict only for
        # suburbs that were NOT in it, raising KeyError.)
        postcode = postcodes.get(suburb, '')

        # ...and append it to the address.
        address = address + ' ' + str(postcode)

        # Choose a random protocol for the website.
        protocol = 'http://'
        if random.randint(0, 1) == 1:
            protocol = 'https://'

        # Build the host name from the restaurant name.
        # NOTE(review): the original appended an empty string here; a domain
        # suffix was probably lost from the source -- confirm.
        website = name.replace('  ', ' ').replace(' ', '.').replace(
            '-', '').strip()
        website = unescape(website)
        website = urllib.quote(website)  # encode as url
        website = protocol + 'www.' + website  # avoid encoding the protocol
        website = website.lower().replace('..', '.')

        # Ensure only some restaurants have owners; with no users at all
        # every restaurant stays ownerless instead of raising.
        owner = None
        if users and random.randint(0, 3) == 0:
            owner = random.choice(users)[0]

        i += 1
        data = (i, name, suburb, address, postcode, phone, hours, cuisine,
                owner, website, cost, image)
        c.execute(
            '''INSERT INTO Restaurants
				(id, name, suburb, address, postcode, phone, hours, cuisine, owner, website, cost, image)
				VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)''', data)
Esempio n. 9
# Script fragment: samples random club pages, collects the words used in
# their titles and writes the ten most common ones to 'vk.txt'.
# It relies on names defined outside this fragment: quantity_of_clubs,
# visited_clubs, words_to_exclude, words_in_club_titles, rx1, rx2, rx3.
#
# NOTE(review): the next line is truncated -- the call that opens the file
# (the part before 'vk.txt') is missing, so it is not valid Python as shown.
f ='vk.txt', mode = 'a', encoding= 'utf-8')
f.write(u'количество клубов - ' + str(quantity_of_clubs) + u'\n')

# `i` counts the clubs accepted so far; it only advances when a page yielded
# a member count (see the increment at the end of the loop body).
i = 1
while i <= quantity_of_clubs:
    club_number = randrange(1000000, 30000000)
    if club_number not in visited_clubs:
        # NOTE(review): the URL template is empty here, so `'' % club_number`
        # raises TypeError; the original URL appears to have been stripped.
        # Nothing visible adds club_number to visited_clubs either -- confirm.
        data = urlopen('' % club_number).read().decode('utf8')
        title = HTMLParser().unescape(re.findall('<title>(.*)</title>', data)[0])
        # Skip pages whose title is one of these two fixed strings.
        if title not in ([u'Ошибка', u'Частная группа']):
            # List of captured digit groups; empty when the counter markup
            # is not present on the page.
            quantity = HTMLParser().unescape(re.findall(u'Участники\s+<em class="pm_counter">(\d+)</em>',data))
            if quantity:
                words = title.lower().split(' ')
                for word in words:
                    # discard single-character words and words from the exclusion list
                    if len(word) != 1 and word not in words_to_exclude:
                        new_word = rx3.sub(u'-', (rx2.sub(u'', rx1.sub(u'',word)))) # remove everything except letters and '-'
                        # discard empty words, single-character words and words from the exclusion list
                        # NOTE(review): the body of this `if` is missing;
                        # presumably new_word was appended to
                        # words_in_club_titles -- confirm.
                        if new_word != '' and len(new_word) != 1 and new_word not in words_to_exclude:
                i +=1

# Report the ten most frequent title words on stdout and in the output file.
counts = Counter(words_in_club_titles)
top = counts.most_common(10)
for element in top:
    print "'%s', %d " %(element[0], element[1])
    f.write("'%s', %d " %(element[0], element[1]))
Esempio n. 10
