Example #1
0
    def generate_snippet(self, doc, query):
        """Return ``doc`` with matching query terms wrapped in double quotes.

        Both ``doc`` and ``query`` are split on whitespace. For every distinct
        query term that also appears in the document, the *first* occurrence
        of that term in the document is emitted as ``"term"``; every other
        word is emitted unchanged. Matching is exact (case-sensitive, no
        stemming, no stop-word removal).

        Args:
            doc: Document text to annotate.
            query: Query text whose terms should be highlighted.

        Returns:
            The document's words re-joined with single spaces. Each word,
            including the last one, is followed by one space, so a non-empty
            document always yields a string with a trailing space; an empty
            document yields ``''``.
        """
        # NOTE(review): only the first occurrence of each term is quoted,
        # which mirrors the historical behaviour; confirm whether later
        # occurrences should be highlighted as well.
        pending_terms = set(query.split())

        pieces = []
        for word in doc.split():
            if word in pending_terms:
                # Quote this hit and retire the term so repeats stay plain.
                pending_terms.discard(word)
                pieces.append('"' + word + '" ')
            else:
                pieces.append(word + ' ')

        return ''.join(pieces)
Example #2
0
    def build_stopped_corpus(self):
        """Write a stop-word-filtered copy of every cleaned corpus file.

        Looks for a ``clean_cacm`` directory under the current working
        directory. If it is missing, it is created (mode 0755), a notice is
        printed, and the method returns without processing anything.
        Otherwise each ``*.html`` file in ``clean_cacm`` is read, split on
        whitespace, stripped of every token found in the stop words returned
        by ``FileAccess.get_stop_words()``, re-joined with single spaces and
        written under the same file name into ``stopped_cacm`` (created with
        mode 0755 if necessary). The name of each processed file is printed.

        The process working directory is left untouched, so the method can
        be called repeatedly and does not affect relative paths used by
        other code.

        Returns:
            None.
        """
        cwd = os.getcwd()
        clean_cacm = os.path.join(cwd, 'clean_cacm')
        stopped_cacm = os.path.join(cwd, 'stopped_cacm')
        fa = FileAccess()

        if not os.path.exists(clean_cacm):
            # Runtime message kept verbatim; parenthesised form prints the
            # same text on Python 2 and Python 3.
            print("Clean corpus doesn't exist. It is created now. "
                  "PLease put cleaned files inside the corpus folder")
            os.makedirs(clean_cacm, 0o755)
            return
        if not os.path.exists(stopped_cacm):
            os.makedirs(stopped_cacm, 0o755)

        # NOTE(review): presumably a collection of words; the membership
        # test below is used as-is because its concrete type is not visible.
        stop_words = fa.get_stop_words()

        # Glob against the absolute directory instead of chdir-ing into it,
        # so the caller's working directory is never modified.
        for source_path in glob.glob(os.path.join(clean_cacm, '*.html')):
            eachfile = os.path.basename(source_path)
            print(eachfile)

            with open(source_path) as source:
                words = source.read().split()
            kept_words = [x for x in words if x not in stop_words]

            with open(os.path.join(stopped_cacm, eachfile), 'w') as target:
                target.write(" ".join(kept_words))
Example #3
0
    def get_stopped_queries(self, query_dict):
        """Return a copy of ``query_dict`` with stop words removed from each query.

        Args:
            query_dict: Mapping whose values are query strings.

        Returns:
            A new dict with the same keys. Each value is the original query
            split on whitespace, with every token found in the stop words
            returned by ``FileAccess.get_stop_words()`` dropped, and the
            remaining tokens joined by single spaces.
        """
        # Load the stop words once; they are shared by every query below.
        stop_words = FileAccess().get_stop_words()

        return {
            query_id: " ".join(
                [term for term in query_dict[query_id].split()
                 if term not in stop_words]
            )
            for query_id in query_dict
        }