Esempio n. 1
0
def test_normalize():
    """Check that normalize() scales hub/authority scores to unit length.

    Works only for the sample data defined above: six pages whose raw hub
    scores are 1..6, so the Euclidean norm is sqrt(1+4+9+16+25+36) = sqrt(91).
    """
    normalize(pageDict)
    # Bug fix: materialize the generator so the debug print shows the values
    # instead of a "<generator object ...>" repr.
    print([page.hub for addr, page in nlp.pagesIndex.items()])
    expected_hub = [i / 91 ** 0.5 for i in range(1, 7)]  # Works only for sample data above
    expected_auth = list(reversed(expected_hub))
    assert len(expected_hub) == len(expected_auth) == len(nlp.pagesIndex)
    assert expected_hub == [page.hub for addr, page in sorted(nlp.pagesIndex.items())]
    assert expected_auth == [page.authority for addr, page in sorted(nlp.pagesIndex.items())]
Esempio n. 2
0
def test_normalize():
    """Verify hub/authority normalization against the sample page set.

    Only valid for the sample data above: hub scores 1..6 normalized by
    sqrt(91); authority scores are the same values in reverse order.
    """
    normalize(pageDict)
    # Bug fix: wrap in a list — printing a bare generator expression shows
    # only its repr, never the hub values.
    print([page.hub for addr, page in nlp.pagesIndex.items()])
    expected_hub = [i / 91 ** 0.5 for i in range(1, 7)]  # Works only for sample data above
    expected_auth = list(reversed(expected_hub))
    assert len(expected_hub) == len(expected_auth) == len(nlp.pagesIndex)
    assert expected_hub == [page.hub for addr, page in sorted(nlp.pagesIndex.items())]
    assert expected_auth == [page.authority for addr, page in sorted(nlp.pagesIndex.items())]
Esempio n. 3
0
 def __init__(self, filename):
     """Parse a song-lyric text file and populate the metadata fields.

     The file is expected to contain 'title:', 'singer:', 'writer:',
     'composer:', 'year:' and 'sex:' labels, each followed by a value on
     the next line, plus a final multi-line 'lyric:' section.
     """
     # Open as UTF-8 text directly: the old f.read().decode('utf-8') only
     # works on Python 2 byte strings (AttributeError on Python 3), and the
     # file handle leaked if any regex lookup below raised.
     with open(filename, encoding='utf-8') as f:
         text = nlp.normalize(f.read())
     self.title = re.search(r'title:\n(.+)\n', text).group(1)
     self.singer = re.search(r'singer:\n(.+)\n', text).group(1)
     self.writer = re.search(r'writer:\n(.+)\n', text).group(1)
     self.composer = re.search(r'composer:\n(.+)\n', text).group(1)
     self.year = re.search(r'year:\n(.+)\n', text).group(1)
     self.sex = re.search(r'sex:\n(.+)\n', text).group(1)
     # NOTE(review): .group() (== group(0)) keeps the 'lyric:\n' prefix in
     # self.lyric, unlike the .group(1) calls above — confirm this is intended.
     self.lyric = re.search(r'lyric:\n((?:.+\n)+)', text).group()
Esempio n. 4
0
def readFile(url):
    """Load an intents JSON file and populate the module-level corpora.

    Appends each intent record to DATUM, its tag to TAGS (paired with a
    zero score), the tokenized normalized patterns to VOCABULARY, and
    (tag, pattern) pairs to TAG_PATTERN.
    """
    with open(url, encoding='utf-8') as handle:
        intents = json.load(handle)

    for record in intents:
        DATUM.append(record)
        label = record['tags']
        TAGS.append((label, 0))
        for raw_pattern in record['patterns']:
            cleaned = normalize(raw_pattern)
            VOCABULARY.extend(tokenization(cleaned))
            TAG_PATTERN.append((label, cleaned))
Esempio n. 5
0
def loadFile(filename):
    """Read a text file, normalize it, split it into pieces, and parse each.

    Returns:
        list: one parsed document per chunk produced by split().
    """
    # Open as UTF-8 text directly: the old f.read().decode('utf-8') only
    # works on Python 2 byte strings (AttributeError on Python 3), and the
    # context manager guarantees the handle is closed.
    with open(filename, encoding='utf-8') as f:
        text = nlp.normalize(f.read())

    return [parse(chunk) for chunk in split(text)]
Esempio n. 6
0
def add_to_index(headers):
    """Fold header words into the inverted index and persist it.

    Each header is a "doc_id ^ title" string; every normalized word of the
    title maps back to the doc_id.  The merged index is serialized as
    "word ^ id1 id2 ...\\n" lines and written via db.write_to_base.
    """
    index = get_index()
    if index == {}:
        index = defaultdict(list)

    for entry in headers:
        parts = entry.split(' ^ ')
        # assumes nlp.normalize yields an iterable of words — TODO confirm
        for word in nlp.normalize(parts[1]):
            index[word].append(parts[0])

    serialized = []
    for key in index:
        # Preserved guard from the original: skip any list-typed keys.
        if type(key) is not list:
            serialized.append(key + ' ^ ' + ' '.join(index[key]) + '\n')

    db.write_to_base(db.index_db, serialized)
Esempio n. 7
0
def prepareSlotValuesIndependent():
    """Build the delexicalisation dictionary of (normalized value, placeholder) pairs.

    Reads each domain's db/<domain>_db.json and maps entity values
    (addresses, names, postcodes, phones, ids, departments) to placeholders
    such as '[restaurant_address]'.  Spelling variants (road/rd, st/street,
    "b & b"/"bed and breakfast", ...) are added alongside the original.
    Domain-independent values (area, food, pricerange, train places,
    weekdays) are appended last so that more specific matches win.

    Returns:
        list[tuple[str, str]]: (surface form, placeholder) pairs.
    """
    domains = ['restaurant', 'hotel', 'attraction', 'train', 'taxi', 'hospital', 'police']
    requestables = ['phone', 'address', 'postcode', 'reference', 'id']
    dic = []
    dic_area = []
    dic_food = []
    dic_price = []

    # read databases
    for domain in domains:
        try:
            # Python 3 fix: the file() builtin no longer exists, which made the
            # bare except below silently skip EVERY domain.  open() via a
            # context manager also closes the handle deterministically.
            with open('db/' + domain + '_db.json') as fin:
                db_json = json.load(fin)

            for ent in db_json:
                for key, val in ent.items():
                    if val == '?' or val == 'free':
                        pass
                    elif key == 'address':
                        dic.append((normalize(val), '[' + domain + '_' + 'address' + ']'))
                        # Add the common road/rd and st/street spelling variant.
                        if "road" in val:
                            val = val.replace("road", "rd")
                            dic.append((normalize(val), '[' + domain + '_' + 'address' + ']'))
                        elif "rd" in val:
                            val = val.replace("rd", "road")
                            dic.append((normalize(val), '[' + domain + '_' + 'address' + ']'))
                        elif "st" in val:
                            val = val.replace("st", "street")
                            dic.append((normalize(val), '[' + domain + '_' + 'address' + ']'))
                        elif "street" in val:
                            val = val.replace("street", "st")
                            dic.append((normalize(val), '[' + domain + '_' + 'address' + ']'))
                    elif key == 'name':
                        dic.append((normalize(val), '[' + domain + '_' + 'name' + ']'))
                        # Add name variants: b&b expansion and bare names without
                        # the "hotel"/"restaurant" suffix.
                        if "b & b" in val:
                            val = val.replace("b & b", "bed and breakfast")
                            dic.append((normalize(val), '[' + domain + '_' + 'name' + ']'))
                        elif "bed and breakfast" in val:
                            val = val.replace("bed and breakfast", "b & b")
                            dic.append((normalize(val), '[' + domain + '_' + 'name' + ']'))
                        elif "hotel" in val and 'gonville' not in val:
                            val = val.replace("hotel", "")
                            dic.append((normalize(val), '[' + domain + '_' + 'name' + ']'))
                        elif "restaurant" in val:
                            val = val.replace("restaurant", "")
                            dic.append((normalize(val), '[' + domain + '_' + 'name' + ']'))
                    elif key == 'postcode':
                        dic.append((normalize(val), '[' + domain + '_' + 'postcode' + ']'))
                    elif key == 'phone':
                        # Phone numbers are kept verbatim (no normalization).
                        dic.append((val, '[' + domain + '_' + 'phone' + ']'))
                    elif key == 'trainID':
                        dic.append((normalize(val), '[' + domain + '_' + 'id' + ']'))
                    elif key == 'department':
                        dic.append((normalize(val), '[' + domain + '_' + 'department' + ']'))

                    # NORMAL DELEX
                    elif key == 'area':
                        dic_area.append((normalize(val), '[' + 'value' + '_' + 'area' + ']'))
                    elif key == 'food':
                        dic_food.append((normalize(val), '[' + 'value' + '_' + 'food' + ']'))
                    elif key == 'pricerange':
                        dic_price.append((normalize(val), '[' + 'value' + '_' + 'pricerange' + ']'))
                    else:
                        pass
                    # TODO car type?
        except Exception:
            # Best-effort: a missing or malformed db file simply means this
            # domain contributes no entries.
            pass

        # Hospital and police have no db files; hard-code their contact data.
        if domain == 'hospital':
            dic.append((normalize('Hills Rd'), '[' + domain + '_' + 'address' + ']'))
            dic.append((normalize('Hills Road'), '[' + domain + '_' + 'address' + ']'))
            dic.append((normalize('CB20QQ'), '[' + domain + '_' + 'postcode' + ']'))
            dic.append(('01223245151', '[' + domain + '_' + 'phone' + ']'))
            dic.append(('1223245151', '[' + domain + '_' + 'phone' + ']'))
            dic.append(('0122324515', '[' + domain + '_' + 'phone' + ']'))
            dic.append((normalize('Addenbrookes Hospital'), '[' + domain + '_' + 'name' + ']'))

        elif domain == 'police':
            dic.append((normalize('Parkside'), '[' + domain + '_' + 'address' + ']'))
            dic.append((normalize('CB11JG'), '[' + domain + '_' + 'postcode' + ']'))
            dic.append(('01223358966', '[' + domain + '_' + 'phone' + ']'))
            dic.append(('1223358966', '[' + domain + '_' + 'phone' + ']'))
            dic.append((normalize('Parkside Police Station'), '[' + domain + '_' + 'name' + ']'))

    # add at the end places from trains
    # Python 3 fix: file() -> open() (this call was OUTSIDE the try and would
    # have raised NameError on Python 3).
    with open('db/' + 'train' + '_db.json') as fin:
        db_json = json.load(fin)

    for ent in db_json:
        for key, val in ent.items():
            if key == 'departure' or key == 'destination':
                dic.append((normalize(val), '[' + 'value' + '_' + 'place' + ']'))

    # add specific values:
    for key in ['monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday']:
        dic.append((normalize(key), '[' + 'value' + '_' + 'day' + ']'))

    # more general values add at the end
    dic.extend(dic_area)
    dic.extend(dic_food)
    dic.extend(dic_price)

    return dic
Esempio n. 8
0
# Script entry point: parse each lyric file given on the command line into a
# SongInfo, then run the lyric text through the NLP pipeline keeping only the
# base forms of non-stopword nouns.
if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser(description='lyric analyzer')
    parser.add_argument('infile', nargs='*')  # zero or more lyric file paths
    args = parser.parse_args()
    filenames = args.infile

    # Load one SongInfo per input file.
    songs = []
    for filename in filenames:
        # song = load_lyric_file(filename)
        song = SongInfo(filename)
        songs.append(song)

    docs = []
    for s in songs:
        # Pipeline: normalize -> tokenize -> keep nouns -> drop stopwords.
        text = nlp.normalize(s.lyric)
        terms = nlp.tokenizer(text)
        terms = nlp.extract_noun(terms)
        terms = nlp.remove_stopword(terms)
        # Keep only the dictionary (base) form of each surviving term.
        s.terms = [t.basic_form for t in terms]

    # dist = distribution([s.getDate(month=True) for s in songs])
    # for k, v in sorted(dist.items()):
    #     print k, v
    # dist = distribution([s.sex for s in songs])
    # for k, v in sorted(dist.items()):
    #     print k, v

    # by year
    # NOTE(review): year_docs is initialised here but the aggregation that
    # fills it appears to continue beyond this excerpt.
    year_labels = [s.getDate() for s in songs]
    year_docs = {}
Esempio n. 9
0
def queryResultVenues(domain, turn, real_belief=False):
    """Query the venue database for entries matching the turn's constraints.

    Args:
        domain: table name to query (e.g. 'restaurant', 'attraction').
        turn: either a dict of slot constraints (real_belief=True), a
            tracker-style slot list under turn[domain] (real_belief='tracking'),
            or a full turn dict with turn['metadata'][domain]['semi'].
        real_belief: selects which of the three turn layouts to read.

    Returns:
        list: matching rows, or [] if the query fails.
    """
    sql_query = "select * from {}".format(domain)

    if real_belief == True:
        items = turn.items()
    elif real_belief == 'tracking':
        # Bug fix: `flag` was read before assignment in this branch (NameError
        # on the first constraint).  Also, the query used to be executed and
        # returned INSIDE the loop, i.e. after folding in only the first slot;
        # it now runs once after all slots are processed.  Finally, an empty
        # slot list used to fall through to an undefined `items` below.
        flag = True
        for slot in turn[domain]:
            key = slot[0].split("-")[1]
            val = slot[0].split("-")[2]
            # Map tracker slot names onto db column names.
            if key == "price range":
                key = "pricerange"
            elif key == "leave at":
                key = "leaveAt"
            elif key == "arrive by":
                key = "arriveBy"
            if val != "do n't care":
                sql_query, flag = _append_condition(sql_query, key, val, flag)
        return _run_query(domain, sql_query)
    else:
        items = turn['metadata'][domain]['semi'].items()

    flag = True
    for key, val in items:
        # Skip unconstrained slots (all the "don't care" spellings).
        if val in ("", "dontcare", "not mentioned", "don't care", "dont care", "do n't care"):
            continue
        sql_query, flag = _append_condition(sql_query, key, val, flag)

    return _run_query(domain, sql_query)


def _append_condition(sql_query, key, val, flag):
    """Append one condition to sql_query; returns (new_query, False).

    `flag` is True only for the first condition, which gets " where ";
    later conditions get " and ".  leaveAt/arriveBy compare with >/<,
    everything else with equality.
    """
    # Escape single quotes for SQL, then normalize the surface form.
    val2 = normalize(val.replace("'", "''"))
    if key == 'leaveAt':
        condition = key + " > '" + val2 + "'"
    elif key == 'arriveBy':
        condition = key + " < '" + val2 + "'"
    else:
        condition = key + "='" + val2 + "'"
    joiner = " where " if flag else " and "
    return sql_query + joiner + condition, False


def _run_query(domain, sql_query):
    """Execute the query, returning fetched rows or [] on any failure."""
    try:  # "select * from attraction  where name = 'queens college'"
        return dbs[domain].execute(sql_query).fetchall()
    except Exception:
        return []  # TODO test it
Esempio n. 10
0
            # NOTE(review): this excerpt starts mid-function — the enclosing
            # def and loop headers are outside this view, so comments only.
            # Count a hit when the bags agree at this position, or when both
            # positions are truthy (non-zero).
            if (inputBag[index] == patternBag[index]
                or inputBag[index] and patternBag[index]): hit += patternBag[index]
        # Locate this pattern's tag inside TAGS (stored as (tag, 0) pairs).
        index = TAGS.index((dup_tag_pattern[dup_tag_pattern.index(pattern)][0], 0))
        if pattern[0] == dup_tags[index][0]:
            # Tuples are immutable: rebuild to accumulate the match rate.
            tmp = list(dup_tags[index])
            tmp[1] += hit/patternLength
            dup_tags[index] = tuple(tmp)
    #find BIGGEST rate
    # 0.5 is the confidence threshold; weaker matches count as "not understood".
    max = 0.5  # NOTE(review): shadows the builtin max()
    index = -1
    for i in range(len(dup_tags)):
        if dup_tags[i][1] > max:
            max = dup_tags[i][1]
            index = i
    #response
    # Below threshold: apologize in Vietnamese ("Sorry, I don't understand").
    if index == -1: print("AI : Xin lỗi tôi không hiểu")
    else:
        responseLength = len(DATUM[index]['response'])
        # Pick a random canned response for the matched tag.
        print("AI: ",DATUM[index]['response'][random.randrange(0, responseLength)])
        # Returning 0 ends the chat loop below; tag index 1 is presumably the
        # "goodbye"/exit intent — TODO confirm against the intents file.
        if index == 1: return 0
        else: return 1

# Load the intents corpus, then run the interactive chat loop until
# training() signals the exit intent (returns 0).
readFile(URL)
print('Chào mừng đến với ChatBot!\n')
keep_chatting = True
while keep_chatting:
    # Normalize the raw user input before bagging it.
    userInput = normalize(input("You: "))
    inputLength = len(userInput.split(" "))
    inputBag = getBag(userInput, VOCABULARY)
    keep_chatting = bool(training(inputBag))
print('ChatBot kết thúc...')
Esempio n. 11
0
def load_tweets_from_csv(filename):
    """Load and normalize tweets from a CSV file, dropping any that mention users.

    Reads column index 5 of each row, normalizes it with nlp.normalize, and
    filters out tweets containing '@'.

    Returns:
        list[str]: normalized tweet texts without '@' mentions.
    """
    # Fixes: the old r[5].decode('utf-8') is Python-2-only (AttributeError on
    # Python 3 str), and the file handle was never closed.  newline='' is the
    # documented way to open files for the csv module.
    with open(filename, newline='', encoding='utf-8') as f:
        tweets = [nlp.normalize(row[5]) for row in csv.reader(f)]
    return [t for t in tweets if '@' not in t]