Example #1
def test_stemming():
    wordsToStem = ["tokenized", "roses", "brutally", "dozens", "stupidity", "notorious", "english", "stemming", "stemmed", "soft"]
    for i in range(len(wordsToStem)):  # stem every word in the list
        wordsToStem[i] = preprocessing.stem(wordsToStem[i])
    stemmedWords = ["token", "rose", "brutal", "dozen", "stupid", "notori", "english", "stem", "stem", "soft"]
    for a, b in zip(wordsToStem, stemmedWords):
        assert(a == b)
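
The preprocessing.stem helper exercised by this test is not shown on this page. A minimal stand-in that would satisfy the expected outputs above, assuming NLTK's PorterStemmer (an assumption for illustration, not the project's actual code):

# hypothetical preprocessing module; assumes NLTK is installed
from nltk.stem.porter import PorterStemmer

_stemmer = PorterStemmer()

def stem(word):
    # Porter stemming of a single lowercased word, e.g. "notorious" -> "notori"
    return _stemmer.stem(word.lower())
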
def processBody(text):
    # print('Body: ',text)
    data = re.sub(r'\{\{.*\}\}', r' ', text)
    data = tokenise(data)
    data = remove_stopwords(data)
    data = stem(data)
    # print('Body: ',data)
    return data
def processTitle(title):
    # print('Title before', title)
    title = title.lower()
    title = tokenise(title)
    title = remove_stopwords(title)
    title = stem(title)
    # print('Title: ', title)
    return title
def processCategories(text):
    data = text.split('\n')
    categories = []
    for line in data:
        if re.match(r'\[\[category', line):
            categories.append(re.sub(r'\[\[category:(.*)\]\]', r'\1', line))
    data = tokenise(' '.join(categories))
    data = remove_stopwords(data)
    data = stem(data)
    # print('Categories: ', data)
    return data
def processLinks(text):
    data = text.split('\n')
    links = []
    for line in data:
        if re.match(r'\*[\ ]*\[', line):
            links.append(line)
    data = tokenise(' '.join(links))
    data = remove_stopwords(data)
    data = stem(data)
    # print('Links: ', data)
    return data
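
The tokenise, remove_stopwords, and stem helpers used by the process* functions above are defined elsewhere in this project. One plausible sketch of such a trio, assuming NLTK's English stopword list and PorterStemmer (names and behaviour are assumptions for illustration only):

# hypothetical helpers; requires nltk plus nltk.download('stopwords')
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

_stop_words = set(stopwords.words('english'))
_porter = PorterStemmer()

def tokenise(text):
    # split lowercased text on runs of non-alphanumeric characters
    return [t for t in re.split(r'[^a-z0-9]+', text.lower()) if t]

def remove_stopwords(tokens):
    return [t for t in tokens if t not in _stop_words]

def stem(tokens):
    return [_porter.stem(t) for t in tokens]
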
Example #6
    def advQueryProcessing(self, engine):
        """Processing based on self.advOptions values
            options is a dictionary of following terms
            "allterms"
            "songname"
            "songend"
            "artist",
            "artistend"
            "genre"
            "pos"
            "from"
            "to"
            Arguments:
                engine -- object to which the Query object belongs
        """
        print("******Advanced query processing******")
        starttime = datetime.datetime.now()
        print("Start time", str(starttime))
        print("Query---", self.queryText)

        self.queryEngine = engine
        q_tokens = tokenize(self.queryText)  #tokenize queryText
        qs_tokens, qs_dict = stem(q_tokens)
        # if len(qs_tokens) == 0:    # uncomment to disallow blank query text
        #     engine.noResult = True
        #     return

        # all the documents satisfying the criteria
        querydocs = self.getAdvResults(qs_tokens)

        #Ranking based on score
        songHeap = []
        for doc in querydocs:
            if isinstance(querydocs, list):  # case where no score is available
                song = selSong(doc, {})
            else:
                song = selSong(doc, querydocs[doc])  # score one document at a time
            songHeap.append(song)
        if len(songHeap) == 0:
            #engine.noResult = True
            self.noResult = True
            printDuration(starttime)
            return
        self.queryResult = songHeap
        songHeap.sort(reverse=True)

        #Fetching song details
        count = engine.displayLength
        self.nextSongListPrep(0, count)

        # retrieval time
        printDuration(starttime)
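
printDuration is another helper that is not shown here; since starttime is a datetime.datetime, a plausible sketch (hypothetical, the project's version may print more detail) is:

import datetime

def printDuration(starttime):
    # hypothetical helper: print wall-clock time elapsed since starttime
    elapsed = datetime.datetime.now() - starttime
    print("Retrieval time:", elapsed.total_seconds(), "seconds")
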
Example #7
def build_indexes():
    """Indexing the documents and updating into db"""

    print("Building all indexes ")
    conn = create_connection(dbname)
    cur = conn.cursor()
    cur.execute(
        "CREATE TABLE IF NOT EXISTS terms(term TEXT PRIMARY KEY,cfreq INTEGER,dfreq INTEGER)"
    )
    cur.execute(
        """CREATE TABLE IF NOT EXISTS termdoc(term INTEGER,docid INTEGER,
                    tfreq INTEGER,dscore REAL,posList TEXT,
                    FOREIGN KEY (term)REFERENCES terms(term),
                    FOREIGN KEY (docid)REFERENCES songs(id),
                    PRIMARY KEY (term, docid))""")
    cur.execute("""CREATE TABLE IF NOT EXISTS permArtist(key Text,docid Text,
                    FOREIGN KEY (docid)REFERENCES songs(id),
                    PRIMARY KEY (key))""")
    cur.execute("""CREATE TABLE IF NOT EXISTS permName(key Text,docid Text,
                    FOREIGN KEY (docid)REFERENCES songs(id),
                    PRIMARY KEY (key))""")
    cur.execute(
        """CREATE TABLE IF NOT EXISTS genreDoc(genre Text,docid INTEGER,
                    FOREIGN KEY (docid)REFERENCES songs(id),
                    PRIMARY KEY (genre, docid))""")
    cur.execute("""CREATE TABLE IF NOT EXISTS yearDoc(year Text,docid INTEGER,
                    FOREIGN KEY (docid)REFERENCES songs(id),
                    PRIMARY KEY (year, docid))""")
    cur.execute('SELECT * FROM songs')
    for row in cur:
        tokens = tokenize(row[5])
        if not tokens:
            continue
        stem_tokens, term_dict_local = stem(tokens)  #PorterStemmer
        updateTermTable(row[0], term_dict_local, conn)
        updateGenreYear(row[4], row[2], row[0], conn)
        permuterm(row[3], row[0], conn, "permArtist")  # perm artist
        permuterm(row[1], row[0], conn, "permName")  #Name
    print("calculating tfidf")
    calculate_Tf_Idf(cur, conn)
    now = datetime.datetime.now()
    print(str(now))
    conn.commit()
    conn.close()

    now = datetime.datetime.now()
    print(str(now))
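
create_connection is assumed to wrap sqlite3 here (the CREATE TABLE IF NOT EXISTS statements above are SQLite syntax); a minimal sketch of such a wrapper:

import sqlite3

def create_connection(dbname):
    # hypothetical wrapper: open (or create) the SQLite database file
    try:
        return sqlite3.connect(dbname)
    except sqlite3.Error as err:
        print("Could not open database", dbname, "-", err)
        raise
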
Example #8
    def queryProcessing(self, engine):
        """query processing for basic Search
            Arguments:
                engine -- object to which the Query object belongs
        """
        self.queryEngine = engine
        print("*********Query processing********")
        starttime = datetime.datetime.now()
        print("Start time", str(starttime))
        print("Query---", self.queryText)

        q_tokens = tokenize(self.queryText)  #tokenize queryText
        qs_tokens, qs_dict = stem(q_tokens)

        if len(qs_tokens) == 0:
            #engine.noResult = True
            self.noResult = True  #choose one
            return

        # all the documents that contain any of the terms
        querydocs = self.getIndexes(qs_tokens)

        #Ranking based on score
        songHeap = []
        for doc in querydocs:  #scoring document at a time
            song = selSong(doc, querydocs[doc])
            songHeap.append(song)

        if len(songHeap) == 0:
            #engine.noResult = True
            self.noResult = True
            return

        self.queryResult = songHeap
        songHeap.sort(reverse=True)
        #Fetching song details
        count = 1 if self.topSearch else engine.displayLength
        self.nextSongListPrep(0, count)

        # retrieval time
        printDuration(starttime)
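
Both queryProcessing and advQueryProcessing rank results simply by calling songHeap.sort(reverse=True), which only works if selSong objects compare by score. A minimal sketch of such a comparable wrapper (hypothetical; the real selSong presumably also carries song metadata, and the structure of the per-document score info is an assumption):

from functools import total_ordering

@total_ordering
class selSong(object):
    # hypothetical wrapper: holds a document id and an aggregate score used for ranking
    def __init__(self, docid, score_info):
        self.docid = docid
        # score_info may be empty when no score is available
        self.score = sum(score_info.values()) if score_info else 0.0

    def __eq__(self, other):
        return self.score == other.score

    def __lt__(self, other):
        return self.score < other.score
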
def processInfo(text):
    data = text.split('\n')
    flag = -1
    info = []
    st = "}}"
    for line in data:
        if re.match(r'\{\{infobox', line):
            info.append(re.sub(r'\{\{infobox(.*)', r'\1', line))
            flag = 0
        elif flag == 0:
            if line == st:
                flag = -1
                continue
            info.append(line)
    data = tokenise(' '.join(info))
    data = remove_stopwords(data)
    data = stem(data)
    # print("Info: ", data)
    return data
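
As a quick illustration of what processInfo collects (assuming the page text was lowercased beforehand and that tokenise/remove_stopwords/stem behave like the sketch earlier on this page), a hypothetical call might look like:

# hypothetical usage; the sample text and expected tokens are illustrative only
sample = "{{infobox settlement\n| name = hyderabad\n| population = 6809970\n}}\nhyderabad is a city."
tokens = processInfo(sample)
# only lines between "{{infobox ..." and the closing "}}" contribute, so the
# result should include stems such as 'hyderabad' and 'popul' but nothing
# from the sentence after the infobox
print(tokens)
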
Example #10
def run(infile, vec_size, name="narr+ice+medhelp", stem=False):

    bin_dir = "/u/sjeblee/tools/word2vec/word2vec/bin"
    #bin_dir = "/u/yoona/word2vec/bin"

    # Input data
    data_dir = "/u/sjeblee/research/va/data/datasets/mds+rct/crossval_sets"
    #data_dir = "/u/yoona/test/mds+rct" # hard-coded, Yoona's data location
    ice_data = "/u/sjeblee/research/data/ICE-India/Corpus/all-lower.txt"
    #ice_data = "/u/yoona/mds+rct/ice_all_lower.txt"
    medhelp_data = "/u/sjeblee/research/data/medhelp/all_medhelp_clean_lower.txt"
    #medhelp_data = "/u/yoona/mds+rct/all_medhelp_clean_lower.txt"
    suffix = ".narrsent"
    if stem:
        ice_data = "/u/sjeblee/research/data/ICE-India/Corpus/all-lower-stem.txt"
        medhelp_data = "/u/sjeblee/research/data/medhelp/all_medhelp_clean_stem.txt"
        suffix = ".narrsent.stem"

    # Output data
    text_data = data_dir + "/" + name + ".txt"
    vec_data = data_dir + "/" + name + ".vectors." + str(vec_size)
    if stem:
        text_data = data_dir + "/" + name + "_stem.txt"
        vec_data = data_dir + "/" + name + "_stem.vectors." + str(vec_size)

    # Quit if vectors already exist
    if os.path.exists(vec_data):
        print "Vectors already exist, quitting"
        return vec_data

    # Extract narrative text from input file
    narrs = extract_features.get_narrs(infile)
    train_data = infile + suffix
    outfile = open(train_data, "w")
    for narr in narrs:
        if stem:
            narr = preprocessing.stem(narr)
        outfile.write(narr + "\n")
    outfile.close()

    # Combine all the text
    #filenames = [ice_data, medhelp_data, train_data]
    filenames = [train_data]
    #filenames = [medhelp_data, train_data]
    sentences = []
    outfile = open(text_data, "w")
    for fname in filenames:
        with open(fname) as inf:
            for line in inf.readlines():
                line = unicode(line, errors='ignore')
                outfile.write(line.strip().encode('utf8') + "\n")  # keep one sentence per line
                sentences.append(line.strip())
    outfile.close()

    #rm -f $VECTOR_DATA
    window_size = 5
    num_threads = 12
    #sentences = []
    #with open(outfile) as f:
    #    for line in f.readlines():
    #        sentences.append(line.strip())

    print "-- Training vectors..."
    #vec_model = Word2Vec(sentences, size=int(vec_size), window=window_size, min_count=1, workers=num_threads, negative=0, sg=1)
    #vec_model = FastText(sentences, size=int(vec_size), window=window_size, min_count=1, word_ngrams=1, min_n=2, max_n=6, workers=num_threads, negative=0)
    #vec_model.save(vec_data)
    #vec_model.wv.save_word2vec_format(vec_data + ".vec")

    if not os.path.exists(vec_data):
        print "--------------------------------------------------------------------"
        process = subprocess.Popen([
            "time", bin_dir + "/word2vec", "-train", text_data, "-output",
            vec_data, "-cbow", "1", "-size",
            str(vec_size), "-window",
            str(window_size), "-negative", "0", "-hs", "1", "-min-count", "1",
            "-sample", "1e-3", "-threads",
            str(num_threads), "-binary", "0"
        ],
                                   stdout=subprocess.PIPE)
        output, err = process.communicate()
        print output
        #time $BIN_DIR/word2vec -train $TEXT_DATA -output $VECTOR_DATA -cbow 0 -size $DIM -window 5 -negative 0 -hs 1 -min-count 1 -sample 1e-3 -threads 12 -binary 0

        print "-- Training binary vectors..."
        process = subprocess.Popen([
            "time", bin_dir + "/word2vec", "-train", text_data, "-output",
            vec_data + ".bin", "-cbow", "1", "-size",
            str(vec_size), "-window",
            str(window_size), "-negative", "0", "-hs", "1", "-min-count", "1",
            "-sample", "1e-3", "-threads",
            str(num_threads), "-binary", "1"
        ],
                                   stdout=subprocess.PIPE)
        output, err = process.communicate()
        print output
        #time $BIN_DIR/word2vec -train $TEXT_DATA -output $VECTOR_DATA.bin -cbow 0 -size $DIM -window 5 -negative 0 -hs 1 -min-count 1 -sample 1e-3 -threads 12 -binary 1

    print "-------------------------------------------------------------------------"
    #echo -- distance...
    #$BIN_DIR/distance $VECTOR_DATA.bin

    return vec_data
def extract(infile, outfile, dict_keys, stem=False, lemma=False, element="narrative", arg_rebalance=""):
    train = False
    narratives = []
    keywords = []
    
    # Get the xml from file
    root = etree.parse(infile).getroot()

    if dict_keys == None:
        train = True

        # Set up the keys for the feature vector
        dict_keys = ["MG_ID", labelname]
        if checklist in featurenames:
            dict_keys = dict_keys + ["CL_DeathAge", "CL_ageunit", "CL_DeceasedSex", "CL_Occupation", "CL_Marital", "CL_Hypertension", "CL_Heart", "CL_Stroke", "CL_Diabetes", "CL_TB", "CL_HIV", "CL_Cancer", "CL_Asthma","CL_InjuryHistory", "CL_SmokeD", "CL_AlcoholD", "CL_ApplytobaccoD"]
        elif dem in featurenames:
            dict_keys = dict_keys + ["CL_DeathAge", "CL_DeceasedSex"]
        print "dict_keys: " + str(dict_keys)
        #keywords = set([])
        #narrwords = set([])

    print "train: " + str(train)
    print "stem: " + str(stem)
    print "lemma: " + str(lemma)
    # Extract features
    matrix = []
    for child in root:
        features = {}

        if rec_type in featurenames:
            features["CL_" + rec_type] = child.tag

        # CHECKLIST features
        for key in dict_keys:
            if key[0:3] == "CL_":
                key = key[3:]
            item = child.find(key)
            value = "0"
            if item != None:
                value = item.text
            if key == "AlcoholD" or key == "ApplytobaccoD":
                if value == 'N':
                    value = 9
            features[key] = value
            #print "-- value: " + value
            #if key == "MG_ID":
            #    print "extracting features from: " + value

        # KEYWORD features
        if kw_features:
            keyword_string = get_keywords(child)
            # Remove punctuation and trailing spaces from keywords
            words = [s.strip().translate(string.maketrans("",""), string.punctuation) for s in keyword_string.split(',')]
            # Split keyword phrases into individual words (collected into a new list)
            split_words = []
            for word in words:
                for wx in word.split(' '):
                    split_words.append(wx.strip().strip('–'))
            keywords.append(" ".join(split_words))
                
        # NARRATIVE features
        if narr_features or ((not train) and (symp_train in featurenames)):
            narr_string = ""
            item = child.find(element)
            if item != None:
                if item.text != None:
                    narr_string = item.text.encode("utf-8")
                else:
                    print "warning: empty narrative"
                narr_words = [w.strip() for w in narr_string.lower().translate(string.maketrans("",""), string.punctuation).split(' ')]
                text = " ".join(narr_words)

                if stem:
                    narr_string = preprocessing.stem(text)
                elif lemma:
                    narr_string = preprocessing.lemmatize(text)
            narratives.append(narr_string.strip().lower())
            #print "Adding narr: " + narr_string.lower()

        # SYMPTOM features
        elif train and (symp_train in featurenames):
            narr_string = ""
            item = child.find("narrative_symptoms")
            if item != None:
                item_text = item.text
                if item_text != None and len(item_text) > 0:
                    narr_string = item.text.encode("utf-8")
                    #narr_words = [w.strip() for w in narr_string.lower().translate(string.maketrans("",""), string.punctuation).split(' ')]
            narratives.append(narr_string.lower())
            print "Adding symp_narr: " + narr_string.lower()

        # Save features
        matrix.append(features)

    # Construct the feature matrix

    # COUNT or TFIDF features
    if narr_count in featurenames or kw_count in featurenames or narr_tfidf in featurenames or kw_tfidf in featurenames or lda in featurenames or symp_train in featurenames:
        documents = []
        if narr_count in featurenames or narr_tfidf in featurenames or lda in featurenames or symp_train in featurenames:
            documents = narratives
            print "narratives: " + str(len(narratives))
        elif kw_count in featurenames or kw_tfidf in featurenames:
            documents = keywords
            print "keywords: " + str(len(keywords))

        # Create count matrix
        global count_vectorizer
        if train:
            print "training count_vectorizer"
            count_vectorizer = sklearn.feature_extraction.text.CountVectorizer(ngram_range=(min_ngram,max_ngram),stop_words=stopwords)
            count_vectorizer.fit(documents)
            dict_keys = dict_keys + count_vectorizer.get_feature_names()
        print "transforming data with count_vectorizer"
        count_matrix = count_vectorizer.transform(documents)
        matrix_keys = count_vectorizer.get_feature_names()

        print "writing count matrix to file"
        out_matrix = open(infile + ".countmatrix", "w")
        out_matrix.write(str(count_matrix))
        out_matrix.close()

        # Add count features to the dictionary
        for x in range(len(matrix)):
            feat = matrix[x]
            for i in range(len(matrix_keys)):
                key = matrix_keys[i]
                val = count_matrix[x,i]
                feat[key] = val

        # Convert counts to TFIDF
        if (narr_tfidf in featurenames) or (kw_tfidf in featurenames):
            print "converting to tfidf..."
            print "matrix_keys: " + str(len(matrix_keys))

            # Use the training count matrix for fitting
            if train:
                global tfidfTransformer
                tfidfTransformer = sklearn.feature_extraction.text.TfidfTransformer()
                tfidfTransformer.fit(count_matrix)

            # Convert matrix to tfidf
            tfidf_matrix = tfidfTransformer.transform(count_matrix)
            print "count_matrix: " + str(count_matrix.shape)
            print "tfidf_matrix: " + str(tfidf_matrix.shape)

            # Replace features in matrix with tfidf
            for x in range(len(matrix)):
                feat = matrix[x]
                #values = tfidf_matrix[x,0:]
                #print "values: " + str(values.shape[0])
                for i in range(len(matrix_keys)):
                    key = matrix_keys[i]
                    val = tfidf_matrix[x,i]
                    feat[key] = val

        # LDA topic modeling features
        if lda in featurenames:
            global ldaModel
            if train:
                ldaModel = LatentDirichletAllocation(n_topics=num_topics)
                ldaModel.fit(count_matrix)
            lda_matrix = ldaModel.transform(count_matrix)
            for t in range(0,num_topics):
                dict_keys.append("lda_topic_" + str(t))
            for x in range(len(matrix)):
                for y in range(len(lda_matrix[x])):
                    val = lda_matrix[x][y]
                    matrix[x]["lda_topic_" + str(y)] = val

            # TODO: Print LDA topics

    # WORD2VEC features
    elif narr_vec in featurenames:
        print "Warning: using word2vec features, ignoring all other features"

        # Create word2vec mapping
        word2vec, dim = load_word2vec(vecfile)

        # Convert words to vectors and add to matrix
        dict_keys.append(narr_vec)
        global max_seq_len
        max_seq_len = 200
        #if train:
            #max_seq_len = 0
        print "word2vec dim: " + str(dim)
        print "initial max_seq_len: " + str(max_seq_len)
        zero_vec = []
        for z in range(0, dim):
            zero_vec.append(0)
        for x in range(len(matrix)):
            narr = narratives[x]
            #print "narr: " + narr
            vectors = []
            vec = zero_vec
            for word in narr.split(' '):
                if len(word) > 0:
                    #if word == "didnt":
                    #    word = "didn't"
                    if word in word2vec:
                        vec = word2vec[word]
                    vectors.append(vec)
            length = len(vectors)
            if length > max_seq_len:
                #if train:
                #    max_seq_len = length
                vectors = vectors[(-1*max_seq_len):]
            (matrix[x])[narr_vec] = vectors

        # Pad the narr_vecs with 0 vectors
        print "padding vectors to reach maxlen " + str(max_seq_len)
        for x in range(len(matrix)):
            length = len(matrix[x][narr_vec])
            matrix[x]['max_seq_len'] = max_seq_len
            if length < max_seq_len:
                for k in range(0, max_seq_len-length):
                    matrix[x][narr_vec].insert(0,zero_vec) # use insert for pre-padding

    # narr_seq for RNN
    elif narr_seq in featurenames:
        global vocab_size, max_seq_len
        if train:
            dict_keys.append(narr_seq)
            dict_keys.append('vocab_size')
            dict_keys.append('max_seq_len')
            vocab = set()
            for narr in narratives:
                words = narr.split(' ')
                for word in words:
                    vocab.add(word)
            vocab_size = len(vocab)
            max_seq_len = 0

        sequences = []

        # Convert text into integer sequences
        for x in range(len(matrix)):
            narr = narratives[x]
            seq = hashing_trick(narr, vocab_size, hash_function='md5', filters='\t\n', lower=True, split=' ')
            if len(seq) > max_seq_len:
                max_seq_len = len(seq)
            sequences.append(seq)

        # Pad the sequences
        sequences = pad_sequences(sequences, maxlen=max_seq_len, dtype='int32', padding='pre')
        for x in range(len(matrix)):
            matrix[x]['narr_seq'] = sequences[x]
            matrix[x]['vocab_size'] = vocab_size
            matrix[x]['max_seq_len'] = max_seq_len

    #if arg_rebalance != "":
    #    matrix_re = rebalance_data(matrix, dict_keys, arg_rebalance)
    #    write_to_file(matrix_re, dict_keys, outfile)
    #else:
    data_util.write_to_file(matrix, dict_keys, outfile)
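
The narr_seq branch above leans on Keras text utilities (hashing_trick and pad_sequences). A standalone sketch of that hashing-and-padding step, assuming the older keras.preprocessing API this Python 2 code appears to target (the narratives below are made-up placeholders):

# minimal sketch of the narr_seq hashing/padding step; assumes an older Keras install
from keras.preprocessing.text import hashing_trick
from keras.preprocessing.sequence import pad_sequences

narratives = ["fever and cough for three days", "sudden chest pain"]  # placeholder text
vocab_size = 50
sequences = [hashing_trick(n, vocab_size, hash_function='md5', filters='\t\n', lower=True, split=' ')
             for n in narratives]
max_seq_len = max(len(s) for s in sequences)
padded = pad_sequences(sequences, maxlen=max_seq_len, dtype='int32', padding='pre')
print(padded.shape)  # (2, max_seq_len)
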
Example #12
def begin_search():
    f = open('./inverted_index/fileNumber.txt', 'r')
    global number_of_files
    number_of_files = int(f.read().strip())
    f.close()

    query_file = sys.argv[1]
    with open(query_file, 'r') as q:
        queries = q.readlines()
    data = ""
    for query in queries:
        global K
        K = query.split(', ')[0]
        K = int(K)
        query = ' '.join(query.split(', ')[1:])
        query = query.lower()
        start = timeit.default_timer()
        if re.match(r'[tbicl]:', query):  # field prefixes t, b, i, c, l
            tempFields = re.findall(r'([tbcil]):', query)
            words = re.findall(r'[tbcil]:([^:]*)(?!\S)', query)
            # print(tempFields, words)
            fields, tokens = [], []
            si = len(words)
            i = 0
            while i < si:
                for word in words[i].split():
                    fields.append(tempFields[i])
                    tokens.append(word)
                i += 1
            tokens = remove_stopwords(tokens)
            tokens = stem(tokens)
            # print(fields, tokens)
            results = field_query_ranking(tokens, fields)
            # print(results)

        else:
            tokens = tokenise(query)
            tokens = remove_stopwords(tokens)
            tokens = stem(tokens)
            results = simple_query_ranking(tokens)
            # print(results)
        if len(results) > 0:
            results = sorted(results, key=results.get, reverse=True)
            if (len(results) > K):
                results = results[:K]
            for key in results:
                key = key.rstrip()
                title, title_doc_num = find_title(key)
                data += title_doc_num
                data += ', '
                # print(title_doc_num, end = ' ')
                if title is not None:
                    for i in title:
                        data += i + ' '
                        # print(i, end = ' ')
                    data = data[:-1]
        else:
            data += "No results found! Try modifying the search by reducing the length maybe?\n"
        end = timeit.default_timer()
        data += str(end - start) + ', '
        data += str((end - start) / K)
        data += '\n\n'
        # print('\n')
    # print('data', data)
    with open('queries_op.txt', 'w') as f:
        f.write(data)
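
Based on the parsing above, each line of the query file begins with K (the number of results to return), then a comma and the query text; field queries use single-letter prefixes followed by a colon. A hypothetical way to prepare and run such a file (the file and script names are assumptions):

# hypothetical query file; the script would be run as: python search.py queries.txt
with open('queries.txt', 'w') as f:
    f.write('5, mahatma gandhi\n')            # plain query, top 5 results
    f.write('3, t:gandhi b:independence\n')   # field query using the prefixes accepted above
# ranked "doc_number, title" lines plus per-query timings end up in queries_op.txt
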