Example #1
def getLinks(url, soup):
    """
        Returns a list of urls
    """
    currentWords = []
    listURLs = []
    wordsInPage = soup.get_text()
    for word in wordsInPage.split():
        if word not in db:
            db[word] = [(util.clean_words(soup.title.text), url)]
        elif word not in currentWords:
            db[word].append((util.clean_words(soup.title.text), url))
        # Record the word only after the checks, so each page is added
        # to an existing entry at most once.
        currentWords.append(word)
    for link in soup.find_all("a"):
        newUrl = urllib.parse.urljoin(url, link.get("href"))
        listURLs.append(newUrl)
    return listURLs
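A minimal usage sketch for this example (not from the original repository); it assumes the module-level db dict and the project's util module (for clean_words) that the snippet relies on.

import urllib.request
import urllib.parse
from bs4 import BeautifulSoup

db = {}  # assumed: module-level index, word -> list of (title, url) pairs

url = "https://example.com/"
soup = BeautifulSoup(urllib.request.urlopen(url).read(), "html.parser")
links = getLinks(url, soup)  # indexes the page into db and returns its outgoing links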
Example #2
def search():
    """Returns the results page."""
    query = util.clean_words(request.args.get("query", ""))
    if query == "":
        return render_template("index.html")
    else:
        results = moogle.answer(app.db, query)
        return render_template("search.html", authors=moogle.authors(), query=query, results=results)
Example #3
def sanitizeText(text):
    try:
        text = util.clean_words(text)
    except Exception:
        # Keep the fallback result instead of discarding it.
        text = text.encode('utf-8', 'ignore').decode('utf-8')

    words = text.split(' ')
    # Drop empty strings as well as stop words.
    return [word for word in words if word and word not in STOP_WORDS]
Example #4
    def make_data(self,
                  trainfilename,
                  maxseqlen=None,
                  maxclauselen=None,
                  label_ind=None,
                  train=False):
        """Read passages from trainfilename and build a batched data generator."""
        use_attention = self.params["use_attention"]
        batch_size = self.params["batch_size"]

        str_seqs, label_seqs = read_passages(trainfilename, is_labeled=train)
        print("Filtering data")
        str_seqs = clean_words(str_seqs)
        label_seqs = to_BIO(label_seqs)
        if not label_ind:
            self.label_ind = {"none": 0}
        else:
            self.label_ind = label_ind
        seq_lengths = [len(seq) for seq in str_seqs]
        if self.maxseqlen is None:
            if maxseqlen:
                self.maxseqlen = maxseqlen
            elif self.params["maxseqlen"] is not None:
                self.maxseqlen = self.params["maxseqlen"]
            else:
                self.maxseqlen = max(seq_lengths)
        if self.maxclauselen is None:
            if maxclauselen:
                self.maxclauselen = maxclauselen
            elif self.params["maxclauselen"] is not None:
                self.maxclauselen = self.params["maxclauselen"]
            elif use_attention:
                sentence_lens = []
                for str_seq in str_seqs:
                    for seq in str_seq:
                        tokens = self.tokenizer.tokenize(seq.lower())
                        sentence_lens.append(len(tokens))
                self.maxclauselen = np.round(
                    np.mean(sentence_lens) +
                    3 * np.std(sentence_lens)).astype(int)

        if len(self.label_ind) <= 1:
            for str_seq, label_seq in zip(str_seqs, label_seqs):
                for label in label_seq:
                    if label not in self.label_ind:
                        # Add new labels with values 0,1,2,....
                        self.label_ind[label] = len(self.label_ind)
        self.rev_label_ind = {i: l for (l, i) in self.label_ind.items()}
        discourse_generator = BertDiscourseGenerator(self.bert, self.tokenizer,
                                                     str_seqs, label_seqs,
                                                     self.label_ind,
                                                     batch_size, use_attention,
                                                     self.maxseqlen,
                                                     self.maxclauselen, train)
        return seq_lengths, discourse_generator  # One-hot representation of labels
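A hedged call sketch: it assumes an already constructed tagger object carrying the params, BERT model and tokenizer this method expects (the variable name and file name are illustrative, not from the repository).

# train=True makes read_passages return labels and fills label_ind
seq_lengths, generator = tagger.make_data("train.txt", train=True)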
Example #5
def crawler_aux(db, url, maxdist):
    if maxdist > 0:
        try:
            response = urllib.request.urlopen(url)
            page = response.read()
            soup = BeautifulSoup(page, "html.parser")
            db[url] = (clean_words(soup.title.text),
                       set(
                           clean_words(soup.get_text() +
                                       soup.title.text).split(" ")))

            for link in soup.find_all("a"):
                link_str = urllib.parse.urljoin(url, link.get("href"))
                if link_str not in db and link_str.endswith('.html'):
                    crawler_aux(db, link_str, maxdist - 1)

        except Exception as e:
            print("Exception found while reading the webpage {}\n{}\n".format(
                url, e))
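A minimal usage sketch (the starting URL is illustrative): crawler_aux fills the db dict in place, mapping each visited page to its cleaned title and its set of words, and only recurses into links ending in .html.

db = {}
crawler_aux(db, "https://example.com/index.html", 2)  # follow links up to depth 2
for page_url, (title, words) in db.items():
    print(title, len(words))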
Example #6
    def make_data(self,
                  trainfilename,
                  maxseqlen=None,
                  maxclauselen=None,
                  label_ind=None,
                  train=False):
        """Read passages from trainfilename and build a batched data generator."""
        use_attention = self.params["use_attention"]
        # Note: the maxseqlen/maxclauselen arguments are overridden by the
        # values stored in self.params.
        maxseqlen = self.params["maxseqlen"]
        maxclauselen = self.params["maxclauselen"]
        batch_size = self.params["batch_size"]

        str_seqs, label_seqs = read_passages(trainfilename, is_labeled=train)
        print("Filtering data")
        str_seqs = clean_words(str_seqs)
        label_seqs = to_BIO(label_seqs)
        if not label_ind:
            self.label_ind = {"none": 0}
        else:
            self.label_ind = label_ind
        seq_lengths = [len(seq) for seq in str_seqs]
        if not maxseqlen:
            maxseqlen = max(seq_lengths)
        if not maxclauselen:
            if use_attention:
                clauselens = []
                for str_seq in str_seqs:
                    clauselens.extend(
                        [len(clause.split()) for clause in str_seq])

                maxclauselen = np.round(
                    np.mean(clauselens) + 3 * np.std(clauselens)).astype(int)
        X = []
        Y = []
        Y_inds = []
        init_word_rep_len = len(self.rep_reader.word_rep)  # Vocab size
        if len(self.label_ind) <= 1:
            for str_seq, label_seq in zip(str_seqs, label_seqs):
                for label in label_seq:
                    if label not in self.label_ind:
                        # Add new labels with values 0,1,2,....
                        self.label_ind[label] = len(self.label_ind)
        self.rev_label_ind = {i: l for (l, i) in self.label_ind.items()}
        discourse_generator = DiscourseGenerator(self.rep_reader, str_seqs,
                                                 label_seqs, self.label_ind,
                                                 batch_size, use_attention,
                                                 maxseqlen, maxclauselen,
                                                 train, self.input_size)
        self.maxseqlen = maxseqlen
        self.maxclauselen = maxclauselen
        return seq_lengths, discourse_generator  # One-hot representation of labels
Example #7
def main():

    parser = argparse.ArgumentParser(
        description="Answer module for the µoogle project",
        epilog=moogle.authors(),
    )

    parser.add_argument("-q", "--query", type=str, help="query (use quotes for more than one word")
    parser.add_argument("-d", "--database", type=str, help="filename of the database", default="moogle.dat")

    args = parser.parse_args(sys.argv[1:])

    db = moogle.load(args.database)
    query = util.clean_words(args.query)
    answer = moogle.answer(db, query)
    pprint.pprint(answer)
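A hedged sketch of driving this entry point; the script name answer.py and the presence of a moogle.dat database are assumptions, not facts from the repository.

import sys

# Equivalent to running: python3 answer.py -q "first second" -d moogle.dat
sys.argv = ["answer.py", "-q", "first second", "-d", "moogle.dat"]
main()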
Example #8
    def calc(self, input_str, points):
        # Trivial case when there are no words.
        if input_str is None:
            return
        try:
            clear = util.clean_words(input_str)  # Clean "litter" (noise) out of the words.
            words = clear.split(" ")  # Split the string into individual words.
            # Score every word of the string.
            for word in words:
                if word == '':
                    continue
                # None if the word has no score yet; otherwise its accumulated score.
                sum_points = self.words_weight.get(word)
                if word in self.CONST_STOP_WORDS:  # Stop words only earn 10% of the points.
                    x = points * 0.1
                else:
                    x = points
                if sum_points is None:
                    self.words_weight[word] = x
                else:
                    self.words_weight[word] += x
        except ValueError as e:
            print(e)
Example #9
def crawler(url, maxdist):
    """
        Crawls the web starting from url,
        following up to maxdist links
        and returns the built database.
    """

    db = {}  # dictionary: word found -> set of (title, url) pairs of the pages where it appears

    webs = []  # list of pages to process
    webs.append((url, 0))  # the original url, at depth 0
    # webs holds every page to visit, as (url, depth) pairs.
    # We cannot really know whether a link is HTML until we open it and
    # look at the content type; the extension alone is not enough.
    # For example, apple.com/education is HTML because the server serves
    # the corresponding index.html

    # for (web, depth) in webs:
    i = 0
    used_webs = set([url])  # pages already visited
    # We use a while loop instead of an iterator because the list is
    # dynamic (appends happen inside the iteration), so it reads more
    # clearly this way.
    while i < len(webs):
        (web, depth) = webs[i]
        try:
            # timeout of 3 seconds; the default value is far too large
            response = urllib.request.urlopen(web, timeout=3)
            content_type = response.info().get('Content-Type')
            # We use this condition instead of
            # content_type == 'text/html', because some pages include
            # extra information in the content type.
            if "html" not in content_type:
                # Skip non-HTML pages and move on to the next one.
                i += 1
                continue
            page = response.read()
            soup = BeautifulSoup(page, "html.parser")

        # Any exception raised here corresponds to an error while
        # establishing the connection (sockets, handshake...) or an
        # HTTP error (404 not found, 403 forbidden...)
        except Exception as e:
            print(e)
            i += 1
            continue
        try:
            # As said above, every word found on any of the pages gets an
            # entry in the dictionary with the set of pages (title, url)
            # where it appears.
            title = clean_words(soup.title.text)
            text = clean_words(soup.get_text())
            content = title + " " + text
            added_words = set()
            # For every word in the page's content
            for word in content.split():
                if word in added_words:
                    continue
                if word not in db:
                    db[word] = {(title, web)}
                else:
                    db[word].add((title, web))
                added_words.add(word)

        # An exception could be raised if a page had no content, or if
        # clean_words failed on some special character.
        except Exception as e:
            print(e)
        # Add every link not seen before that uses the http or https
        # protocol.
        # If we are already at the maximum depth, we do not look at any link.
        # Children get a depth one greater than their parent's.
        if depth < maxdist:
            for link in soup.find_all("a"):
                try:
                    newurl = urllib.parse.urljoin(web, link.get("href"))
                    (newweb, fragment) = urllib.parse.urldefrag(newurl)
                    if newweb.startswith("http") and newweb not in used_webs:
                        used_webs.add(newweb)
                        webs.append((newweb, depth + 1))
                # urljoin can raise an exception if the url is malformed
                except Exception as e:
                    print(e)
                    continue
        i += 1
    return db
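A minimal usage sketch (the starting URL and depth are illustrative): crawler returns the in-memory index that the rest of the µoogle project presumably serializes and queries.

db = crawler("https://example.com/", 2)  # follow links up to depth 2
print(len(db), "distinct words indexed")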
Example #10
    def get_title(self):
        return util.clean_words(self.title)