def parse(CIK, filing_year, company_name=None, raw_website_data=None, \
    processed_website_data=None, get_legal_proceeding_only=False, get_litigation_footnotes_only=False):
    '''
        Parse a 10-K filing for litigation mentions.

        Builds a ParsingResults object for (CIK, filing_year), filling in the
        company name and processed filing text if they were not supplied, then
        runs litigation-mention extraction over the text.

        Arguments:
            CIK                          -- SEC Central Index Key of the company
            filing_year                  -- year of the 10-K filing
            company_name                 -- optional; looked up from EDGAR by CIK if None
            raw_website_data             -- optional raw HTML of the filing; downloaded if None
            processed_website_data       -- optional pre-cleaned filing text; derived from the
                                            raw HTML if None
            get_legal_proceeding_only    -- passed through to the mention extractor
            get_litigation_footnotes_only -- passed through to the mention extractor

        Returns the populated ParsingResults.
        Raises Exception when no text is available and no URL can be found.
    '''
    results = ParsingResults(CIK, filing_year, company_name, processed_text=processed_website_data)

    if results.company_name is None:
        results.company_name = edgar.get_name_of_company_from_cik(results.CIK)

    if results.processed_text is None:

        if raw_website_data is None:

            url = edgar.get_10k_url(CIK=results.CIK, filing_year=results.filing_year)

            # guard clause: nothing to download means nothing to parse
            if url is None:
                raise Exception("Error: No URL to parse for data.")

            # use the shared download timeout so a stalled EDGAR connection
            # cannot hang the parse (consistent with _get_raw_data)
            response = urllib2.urlopen(url, timeout=Constants.URL_DOWNLOAD_TIMEOUT_IN_SECS).read()
            CorpusAccess.write_raw_url_data_to_file(response, results.CIK, results.filing_year)

        else:
            response = raw_website_data

        results.processed_text = convert_html_into_clean_text(response)

    _get_litigaton_mentions(results, get_legal_proceeding_only, get_litigation_footnotes_only)

    return results
def _get_raw_data(CIK, year):
    '''
        process-safe way of accessing a given 10-K as indexed
        by CIK and filing year. method will store the data to disk
        if it's not already there

        Returns the raw filing data, or None if it is neither cached
        nor downloadable (no URL found).
    '''
    # maintain exclusive zone when acquiring raw data.
    # this section of the code could, based on OS scheduling, easily
    # lead to multiple download attempts of the same data.
    _corpus_access_mutex.acquire()

    # try/finally guarantees the mutex is released even if the download
    # raises (e.g. network error or timeout); otherwise every later
    # caller would deadlock on acquire().
    try:
        raw_data = CorpusAccess.get_raw_website_data_from_corpus(CIK=CIK, filing_year=year)

        if raw_data is None:
            url = edgar.get_10k_url(CIK=CIK, filing_year=year)

            if url is not None:
                raw_data = urllib2.urlopen(url, timeout=Constants.URL_DOWNLOAD_TIMEOUT_IN_SECS).read()
                # cache to disk so later calls skip the download
                CorpusAccess.write_raw_url_data_to_file(raw_data, CIK=CIK, filing_year=year)
    finally:
        _corpus_access_mutex.release()

    return raw_data